/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
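 *
 * For example, assuming 4 KB pages, a request with pos = 0x12345 has a
 * page offset of 0x345: the mapping itself starts at trunc_page() of the
 * chosen address and the returned pointer is bumped by 0x345, so the
 * caller still sees the byte at file offset 0x12345 at the returned
 * address.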
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) !=
		    (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
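		 *
		 * (Concretely, a hint of 0, or one that falls between the
		 * rounded text start and the highest possible end of the
		 * data segment, is replaced below by
		 * round_page(vm_daddr + lim_max(RLIMIT_DATA)).)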
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block. Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
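		 *
		 * (For example, asking for PROT_WRITE on a MAP_SHARED
		 * mapping of a descriptor opened read-only fails with
		 * EACCES, while a MAP_PRIVATE mapping may still be written,
		 * since modified pages become anonymous copy-on-write
		 * copies.)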
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif				/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
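	 * (The third argument below asks for synchronous writes unless
	 * MS_ASYNC was given; the fourth requests invalidation when
	 * MS_INVALIDATE was given.)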
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = priv_check(td, PRIV_VM_MADV_PROTECT);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
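	 *
	 * One status byte is written to the user's vector for each page in
	 * the range, via subyte(), carrying MINCORE_* bits for residency,
	 * modification, and reference state.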
834 */ 835 lastvecindex = -1; 836 for (current = entry; 837 (current != &map->header) && (current->start < end); 838 current = current->next) { 839 840 /* 841 * check for contiguity 842 */ 843 if (current->end < end && 844 (entry->next == &map->header || 845 current->next->start > current->end)) { 846 vm_map_unlock_read(map); 847 return (ENOMEM); 848 } 849 850 /* 851 * ignore submaps (for now) or null objects 852 */ 853 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 854 current->object.vm_object == NULL) 855 continue; 856 857 /* 858 * limit this scan to the current map entry and the 859 * limits for the mincore call 860 */ 861 if (addr < current->start) 862 addr = current->start; 863 cend = current->end; 864 if (cend > end) 865 cend = end; 866 867 /* 868 * scan this entry one page at a time 869 */ 870 while (addr < cend) { 871 /* 872 * Check pmap first, it is likely faster, also 873 * it can provide info as to whether we are the 874 * one referencing or modifying the page. 875 */ 876 object = NULL; 877 locked_pa = 0; 878 retry: 879 m = NULL; 880 mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); 881 if (locked_pa != 0) { 882 /* 883 * The page is mapped by this process but not 884 * both accessed and modified. It is also 885 * managed. Acquire the object lock so that 886 * other mappings might be examined. 887 */ 888 m = PHYS_TO_VM_PAGE(locked_pa); 889 if (m->object != object) { 890 if (object != NULL) 891 VM_OBJECT_WUNLOCK(object); 892 object = m->object; 893 locked = VM_OBJECT_TRYWLOCK(object); 894 vm_page_unlock(m); 895 if (!locked) { 896 VM_OBJECT_WLOCK(object); 897 vm_page_lock(m); 898 goto retry; 899 } 900 } else 901 vm_page_unlock(m); 902 KASSERT(m->valid == VM_PAGE_BITS_ALL, 903 ("mincore: page %p is mapped but invalid", 904 m)); 905 } else if (mincoreinfo == 0) { 906 /* 907 * The page is not mapped by this process. If 908 * the object implements managed pages, then 909 * determine if the page is resident so that 910 * the mappings might be examined. 911 */ 912 if (current->object.vm_object != object) { 913 if (object != NULL) 914 VM_OBJECT_WUNLOCK(object); 915 object = current->object.vm_object; 916 VM_OBJECT_WLOCK(object); 917 } 918 if (object->type == OBJT_DEFAULT || 919 object->type == OBJT_SWAP || 920 object->type == OBJT_VNODE) { 921 pindex = OFF_TO_IDX(current->offset + 922 (addr - current->start)); 923 m = vm_page_lookup(object, pindex); 924 if (m == NULL && 925 vm_page_is_cached(object, pindex)) 926 mincoreinfo = MINCORE_INCORE; 927 if (m != NULL && m->valid == 0) 928 m = NULL; 929 if (m != NULL) 930 mincoreinfo = MINCORE_INCORE; 931 } 932 } 933 if (m != NULL) { 934 /* Examine other mappings to the page. */ 935 if (m->dirty == 0 && pmap_is_modified(m)) 936 vm_page_dirty(m); 937 if (m->dirty != 0) 938 mincoreinfo |= MINCORE_MODIFIED_OTHER; 939 /* 940 * The first test for PGA_REFERENCED is an 941 * optimization. The second test is 942 * required because a concurrent pmap 943 * operation could clear the last reference 944 * and set PGA_REFERENCED before the call to 945 * pmap_is_referenced(). 946 */ 947 if ((m->aflags & PGA_REFERENCED) != 0 || 948 pmap_is_referenced(m) || 949 (m->aflags & PGA_REFERENCED) != 0) 950 mincoreinfo |= MINCORE_REFERENCED_OTHER; 951 } 952 if (object != NULL) 953 VM_OBJECT_WUNLOCK(object); 954 955 /* 956 * subyte may page fault. In case it needs to modify 957 * the map, we release the lock. 
958 */ 959 vm_map_unlock_read(map); 960 961 /* 962 * calculate index into user supplied byte vector 963 */ 964 vecindex = OFF_TO_IDX(addr - first_addr); 965 966 /* 967 * If we have skipped map entries, we need to make sure that 968 * the byte vector is zeroed for those skipped entries. 969 */ 970 while ((lastvecindex + 1) < vecindex) { 971 error = subyte(vec + lastvecindex, 0); 972 if (error) { 973 error = EFAULT; 974 goto done2; 975 } 976 ++lastvecindex; 977 } 978 979 /* 980 * Pass the page information to the user 981 */ 982 error = subyte(vec + vecindex, mincoreinfo); 983 if (error) { 984 error = EFAULT; 985 goto done2; 986 } 987 988 /* 989 * If the map has changed, due to the subyte, the previous 990 * output may be invalid. 991 */ 992 vm_map_lock_read(map); 993 if (timestamp != map->timestamp) 994 goto RestartScan; 995 996 lastvecindex = vecindex; 997 addr += PAGE_SIZE; 998 } 999 } 1000 1001 /* 1002 * subyte may page fault. In case it needs to modify 1003 * the map, we release the lock. 1004 */ 1005 vm_map_unlock_read(map); 1006 1007 /* 1008 * Zero the last entries in the byte vector. 1009 */ 1010 vecindex = OFF_TO_IDX(end - first_addr); 1011 while ((lastvecindex + 1) < vecindex) { 1012 error = subyte(vec + lastvecindex, 0); 1013 if (error) { 1014 error = EFAULT; 1015 goto done2; 1016 } 1017 ++lastvecindex; 1018 } 1019 1020 /* 1021 * If the map has changed, due to the subyte, the previous 1022 * output may be invalid. 1023 */ 1024 vm_map_lock_read(map); 1025 if (timestamp != map->timestamp) 1026 goto RestartScan; 1027 vm_map_unlock_read(map); 1028 done2: 1029 return (error); 1030 } 1031 1032 #ifndef _SYS_SYSPROTO_H_ 1033 struct mlock_args { 1034 const void *addr; 1035 size_t len; 1036 }; 1037 #endif 1038 /* 1039 * MPSAFE 1040 */ 1041 int 1042 sys_mlock(td, uap) 1043 struct thread *td; 1044 struct mlock_args *uap; 1045 { 1046 1047 return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len)); 1048 } 1049 1050 int 1051 vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len) 1052 { 1053 vm_offset_t addr, end, last, start; 1054 vm_size_t npages, size; 1055 vm_map_t map; 1056 unsigned long nsize; 1057 int error; 1058 1059 error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); 1060 if (error) 1061 return (error); 1062 addr = (vm_offset_t)addr0; 1063 size = len; 1064 last = addr + size; 1065 start = trunc_page(addr); 1066 end = round_page(last); 1067 if (last < addr || end < addr) 1068 return (EINVAL); 1069 npages = atop(end - start); 1070 if (npages > vm_page_max_wired) 1071 return (ENOMEM); 1072 map = &proc->p_vmspace->vm_map; 1073 PROC_LOCK(proc); 1074 nsize = ptoa(npages + pmap_wired_count(map->pmap)); 1075 if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) { 1076 PROC_UNLOCK(proc); 1077 return (ENOMEM); 1078 } 1079 PROC_UNLOCK(proc); 1080 if (npages + cnt.v_wire_count > vm_page_max_wired) 1081 return (EAGAIN); 1082 #ifdef RACCT 1083 PROC_LOCK(proc); 1084 error = racct_set(proc, RACCT_MEMLOCK, nsize); 1085 PROC_UNLOCK(proc); 1086 if (error != 0) 1087 return (ENOMEM); 1088 #endif 1089 error = vm_map_wire(map, start, end, 1090 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1091 #ifdef RACCT 1092 if (error != KERN_SUCCESS) { 1093 PROC_LOCK(proc); 1094 racct_set(proc, RACCT_MEMLOCK, 1095 ptoa(pmap_wired_count(map->pmap))); 1096 PROC_UNLOCK(proc); 1097 } 1098 #endif 1099 return (error == KERN_SUCCESS ? 
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
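			 *
			 * When the object's handle is a different vnode (e.g.
			 * the lower vnode under a nullfs mount), switch to
			 * that vnode and lock it instead.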
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
1437 * 1438 * XXX assumes VM_PROT_* == PROT_* 1439 */ 1440 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1441 dev_relthread(cdev, ref); 1442 if (error != ENODEV) 1443 return (error); 1444 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, 1445 td->td_ucred); 1446 if (obj == NULL) 1447 return (EINVAL); 1448 *objp = obj; 1449 *flagsp = flags; 1450 return (0); 1451 } 1452 1453 /* 1454 * vm_mmap_shm() 1455 * 1456 * MPSAFE 1457 * 1458 * Helper function for vm_mmap. Perform sanity check specific for mmap 1459 * operations on shm file descriptors. 1460 */ 1461 int 1462 vm_mmap_shm(struct thread *td, vm_size_t objsize, 1463 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1464 struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) 1465 { 1466 int error; 1467 1468 if ((*flagsp & MAP_SHARED) != 0 && 1469 (*maxprotp & VM_PROT_WRITE) == 0 && 1470 (prot & PROT_WRITE) != 0) 1471 return (EACCES); 1472 #ifdef MAC 1473 error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); 1474 if (error != 0) 1475 return (error); 1476 #endif 1477 error = shm_mmap(shmfd, objsize, foff, objp); 1478 if (error) 1479 return (error); 1480 return (0); 1481 } 1482 1483 /* 1484 * vm_mmap() 1485 * 1486 * MPSAFE 1487 * 1488 * Internal version of mmap. Currently used by mmap, exec, and sys5 1489 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1490 */ 1491 int 1492 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1493 vm_prot_t maxprot, int flags, 1494 objtype_t handle_type, void *handle, 1495 vm_ooffset_t foff) 1496 { 1497 boolean_t fitit; 1498 vm_object_t object = NULL; 1499 struct thread *td = curthread; 1500 int docow, error, findspace, rv; 1501 boolean_t writecounted; 1502 1503 if (size == 0) 1504 return (0); 1505 1506 size = round_page(size); 1507 1508 if (map == &td->td_proc->p_vmspace->vm_map) { 1509 PROC_LOCK(td->td_proc); 1510 if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { 1511 PROC_UNLOCK(td->td_proc); 1512 return (ENOMEM); 1513 } 1514 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { 1515 PROC_UNLOCK(td->td_proc); 1516 return (ENOMEM); 1517 } 1518 if (!old_mlock && map->flags & MAP_WIREFUTURE) { 1519 if (ptoa(pmap_wired_count(map->pmap)) + size > 1520 lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { 1521 racct_set_force(td->td_proc, RACCT_VMEM, 1522 map->size); 1523 PROC_UNLOCK(td->td_proc); 1524 return (ENOMEM); 1525 } 1526 error = racct_set(td->td_proc, RACCT_MEMLOCK, 1527 ptoa(pmap_wired_count(map->pmap)) + size); 1528 if (error != 0) { 1529 racct_set_force(td->td_proc, RACCT_VMEM, 1530 map->size); 1531 PROC_UNLOCK(td->td_proc); 1532 return (error); 1533 } 1534 } 1535 PROC_UNLOCK(td->td_proc); 1536 } 1537 1538 /* 1539 * We currently can only deal with page aligned file offsets. 1540 * The check is here rather than in the syscall because the 1541 * kernel calls this function internally for other mmaping 1542 * operations (such as in exec) and non-aligned offsets will 1543 * cause pmap inconsistencies...so we want to be sure to 1544 * disallow this in all cases. 1545 */ 1546 if (foff & PAGE_MASK) 1547 return (EINVAL); 1548 1549 if ((flags & MAP_FIXED) == 0) { 1550 fitit = TRUE; 1551 *addr = round_page(*addr); 1552 } else { 1553 if (*addr != trunc_page(*addr)) 1554 return (EINVAL); 1555 fitit = FALSE; 1556 } 1557 writecounted = FALSE; 1558 1559 /* 1560 * Lookup/allocate object. 
1561 */ 1562 switch (handle_type) { 1563 case OBJT_DEVICE: 1564 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, 1565 handle, &foff, &object); 1566 break; 1567 case OBJT_VNODE: 1568 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1569 handle, &foff, &object, &writecounted); 1570 break; 1571 case OBJT_SWAP: 1572 error = vm_mmap_shm(td, size, prot, &maxprot, &flags, 1573 handle, foff, &object); 1574 break; 1575 case OBJT_DEFAULT: 1576 if (handle == NULL) { 1577 error = 0; 1578 break; 1579 } 1580 /* FALLTHROUGH */ 1581 default: 1582 error = EINVAL; 1583 break; 1584 } 1585 if (error) 1586 return (error); 1587 if (flags & MAP_ANON) { 1588 object = NULL; 1589 docow = 0; 1590 /* 1591 * Unnamed anonymous regions always start at 0. 1592 */ 1593 if (handle == 0) 1594 foff = 0; 1595 } else if (flags & MAP_PREFAULT_READ) 1596 docow = MAP_PREFAULT; 1597 else 1598 docow = MAP_PREFAULT_PARTIAL; 1599 1600 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1601 docow |= MAP_COPY_ON_WRITE; 1602 if (flags & MAP_NOSYNC) 1603 docow |= MAP_DISABLE_SYNCER; 1604 if (flags & MAP_NOCORE) 1605 docow |= MAP_DISABLE_COREDUMP; 1606 /* Shared memory is also shared with children. */ 1607 if (flags & MAP_SHARED) 1608 docow |= MAP_INHERIT_SHARE; 1609 if (writecounted) 1610 docow |= MAP_VN_WRITECOUNT; 1611 1612 if (flags & MAP_STACK) 1613 rv = vm_map_stack(map, *addr, size, prot, maxprot, 1614 docow | MAP_STACK_GROWS_DOWN); 1615 else if (fitit) { 1616 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) 1617 findspace = VMFS_SUPER_SPACE; 1618 else if ((flags & MAP_ALIGNMENT_MASK) != 0) 1619 findspace = VMFS_ALIGNED_SPACE(flags >> 1620 MAP_ALIGNMENT_SHIFT); 1621 else 1622 findspace = VMFS_OPTIMAL_SPACE; 1623 rv = vm_map_find(map, object, foff, addr, size, findspace, 1624 prot, maxprot, docow); 1625 } else 1626 rv = vm_map_fixed(map, object, foff, *addr, size, 1627 prot, maxprot, docow); 1628 1629 if (rv == KERN_SUCCESS) { 1630 /* 1631 * If the process has requested that all future mappings 1632 * be wired, then heed this. 1633 */ 1634 if (map->flags & MAP_WIREFUTURE) { 1635 vm_map_wire(map, *addr, *addr + size, 1636 VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? 1637 VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); 1638 } 1639 } else { 1640 /* 1641 * If this mapping was accounted for in the vnode's 1642 * writecount, then undo that now. 1643 */ 1644 if (writecounted) 1645 vnode_pager_release_writecount(object, 0, size); 1646 /* 1647 * Lose the object reference. Will destroy the 1648 * object if it's an unnamed anonymous mapping 1649 * or named anonymous without other references. 1650 */ 1651 vm_object_deallocate(object); 1652 } 1653 return (vm_mmap_to_errno(rv)); 1654 } 1655 1656 /* 1657 * Translate a Mach VM return code to zero on success or the appropriate errno 1658 * on failure. 1659 */ 1660 int 1661 vm_mmap_to_errno(int rv) 1662 { 1663 1664 switch (rv) { 1665 case KERN_SUCCESS: 1666 return (0); 1667 case KERN_INVALID_ADDRESS: 1668 case KERN_NO_SPACE: 1669 return (ENOMEM); 1670 case KERN_PROTECTION_FAILURE: 1671 return (EACCES); 1672 default: 1673 return (EINVAL); 1674 } 1675 } 1676