/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
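 *
 * MAP_STACK requests are converted to anonymous mappings below; they must
 * ask for both PROT_READ and PROT_WRITE and may not name a file descriptor.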
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int flags, error;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) !=
		    (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		rights = CAP_MMAP;
		if (prot & PROT_READ)
			rights |= CAP_READ;
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				rights |= CAP_WRITE;
		}
		if (prot & PROT_EXEC)
			rights |= CAP_MAPEXEC;
		if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot,
		    &fp)) != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
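		 *
		 * For private mappings of anything other than a character
		 * device, writes go to a copy-on-write copy, so maxprot may
		 * include write regardless of how the file was opened.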
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif				/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
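	 * The flush is synchronous unless MS_ASYNC was requested;
	 * MS_INVALIDATE additionally asks vm_map_sync() to invalidate any
	 * cached pages in the range.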
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = priv_check(td, PRIV_VM_MADV_PROTECT);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_UNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_LOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_UNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_LOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_UNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
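			 * If the map changes while it is unlocked, the
			 * timestamp check below restarts the whole scan.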
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	struct proc *proc;
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	proc = td->td_proc;
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
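	/* VM_MAP_WIRE_HOLESOK: unmapped gaps in the range are not an error. */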
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_sub(td->td_proc, RACCT_MEMLOCK, ptoa(end - start));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.
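			 * The object's handle names the vnode of the lower
			 * file system (e.g. for nullfs), so re-lock and
			 * continue with that vnode instead.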
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred);
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
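	 * handle_type selects the helper that turns the handle (vnode, cdev
	 * or shmfd) into a VM object; OBJT_DEFAULT with a NULL handle is the
	 * anonymous case and needs no object here.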
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else if (fitit)
		rv = vm_map_find(map, object, foff, addr, size,
		    object != NULL && object->type == OBJT_DEVICE ?
		    VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
	else
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}