/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.
 * Otherwise there would be no cache coherency between a descriptor
 * and a VM mapping both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif				/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{

	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
	if (error)
		return (error);
	addr = (vm_offset_t)addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, findspace, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
		    0, findspace, prot, maxprot, docow);
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}