/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
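/*
 * For illustration only (hypothetical userland call, not part of the
 * original sources): a request with a non-page-aligned offset such as
 *
 *	p = mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, pagesize + 12);
 *
 * is backed by a mapping that begins at the page boundary below the
 * requested offset; the returned pointer is then adjusted up by the page
 * offset (12 here), so p refers to byte pagesize + 12 of the file.
 */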
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int flags, error;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as filedescriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		rights = CAP_MMAP;
		if (prot & PROT_READ)
			rights |= CAP_MMAP_R;
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				rights |= CAP_MMAP_W;
		}
		if (prot & PROT_EXEC)
			rights |= CAP_MMAP_X;
		if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot,
		    &fp)) != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif				/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = priv_check(td, PRIV_VM_MADV_PROTECT);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		     current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	struct proc *proc;
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	proc = td->td_proc;
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
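	/*
	 * VM_MAP_WIRE_HOLESOK lets the sweep over the entire map skip
	 * unmapped gaps instead of failing on them.
	 */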
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_sub(td->td_proc, RACCT_MEMLOCK, ptoa(end - start));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred);
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else if (fitit)
		rv = vm_map_find(map, object, foff, addr, size,
		    object != NULL && object->type == OBJT_DEVICE ?
		    VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
	else
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
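/*
 * For illustration only (hypothetical caller, not part of this file): an
 * in-kernel user of the internal interface above could create an anonymous,
 * pageable mapping with
 *
 *	vm_offset_t addr = 0;
 *	int error;
 *
 *	error = vm_mmap(map, &addr, size, VM_PROT_ALL, VM_PROT_ALL,
 *	    MAP_ANON, OBJT_DEFAULT, NULL, 0);
 *
 * which takes the OBJT_DEFAULT/NULL-handle path, lets vm_map_find() choose
 * a suitable address because MAP_FIXED is not set, and returns either 0 or
 * an errno value derived via vm_mmap_to_errno().
 */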