/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.
 * Otherwise there would be no cache coherency between a descriptor
 * and a VM mapping both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as filedescriptor and
	 * zero position for new code. Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
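		 *
		 * (Added clarification, not part of the original comment:
		 * for example, a hint such as 0x90000000 together with
		 * MAP_32BIT is simply reset to 0 here so that vm_map_find()
		 * picks a low address, whereas the same hint combined with
		 * MAP_FIXED | MAP_32BIT fails with EINVAL above.)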
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block. Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
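		 *
		 * (Added clarification, not part of the original comment:
		 * for example, a descriptor opened O_RDONLY and mapped
		 * MAP_SHARED with PROT_WRITE fails below with EACCES,
		 * while a MAP_PRIVATE mapping of the same descriptor is
		 * allowed because its writes go to private copy-on-write
		 * pages rather than back to the file.)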
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
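	 *
	 * (Added clarification, not part of the original comment: with
	 * MS_ASYNC the fourth argument below is FALSE, so the flush is
	 * started but not waited for; MS_INVALIDATE additionally requests
	 * that cached copies of the pages be invalidated.)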
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
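	/*
	 * (Added clarification, not from the original source: the rounding
	 * here mirrors msync/munmap/mprotect above.  Assuming 4 KB pages,
	 * addr = 0x1234 and len = 0x100 give pageoff = 0x234, so after the
	 * adjustments below the operation covers the page range
	 * [0x1000, 0x2000).)
	 */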
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current processes address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{

	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
	if (error)
		return (error);
	addr = (vm_offset_t)addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall(). vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
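			 *
			 * (Added note, an assumption not stated in the
			 * original comment: nullfs is a typical bypass
			 * filesystem here; its vnode shares the lower
			 * vnode's VM object, so obj->handle points at the
			 * lower vnode and the code above re-locks that
			 * vnode and operates on it instead.)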
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, findspace, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
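	 *
	 * (Added clarification, not part of the original comment: the
	 * switch below dispatches on handle_type -- OBJT_DEVICE to
	 * vm_mmap_cdev(), OBJT_VNODE to vm_mmap_vnode(), OBJT_SWAP to
	 * vm_mmap_shm(); OBJT_DEFAULT with a NULL handle is the anonymous
	 * case and leaves the object pointer NULL, which is handled by
	 * the MAP_ANON branch later in this function.)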
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
		    0, findspace, prot, maxprot, docow);
	} else
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}