/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking, vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.
 * We want a default that is good enough to prevent the kernel from running
 * out of resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
	void *dummy;
{
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t, vm_object_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */
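/*
 * Illustrative example (hypothetical call): a request such as
 *
 *	p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0x1234);
 *
 * is serviced by mapping the file from trunc_page(0x1234) and returning
 * an address whose offset within its page equals 0x1234 & PAGE_MASK, so
 * that *p refers to the byte at file offset 0x1234.
 */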
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int flags, error;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;
	/* Make sure the mapping fits into the numeric range, etc. */
	if ((ssize_t) uap->len < 0 ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping a file, so get fp for validation.  Obtain the
		 * vnode and make sure it is of the appropriate type;
		 * don't let the descriptor disappear on us if we block.
		 */
		if ((error = fget(td, uap->fd, &fp)) != 0)
			goto done;
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_NOSYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	/*
	 * Translate the old protection bits (bit 0 = execute,
	 * bit 1 = write, bit 2 = read) to the modern PROT_* values.
	 */
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	int len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				PMC_CALL_HOOK(td, PMC_FN_MUNMAP,
				    (void *) &pkm);
				break;
			}
		}
	}
#endif
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_delete(map, addr, addr + size);
	vm_map_unlock(map);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = priv_check(td, PRIV_VM_MADV_PROTECT);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (current->next == &map->header ||
		     current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;
				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				VM_OBJECT_LOCK(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather
				 * information about it.
				 */
				if (m != NULL && m->valid != 0) {
					mincoreinfo = MINCORE_INCORE;
					vm_page_lock_queues();
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
					vm_page_unlock_queues();
				}
				VM_OBJECT_UNLOCK(current->object.vm_object);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	struct proc *proc;
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	int error;

	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	proc = td->td_proc;
	PROC_LOCK(proc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
	    lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
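
/*
 * Note that mlockall(MCL_FUTURE) below merely sets the MAP_WIREFUTURE
 * flag on the process map; wiring of pages created by later mappings is
 * done by vm_mmap(), which checks that flag after a successful
 * vm_map_find()/vm_map_stack() (see the end of this file).
 */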
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = 0;

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

#if 0
	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	PROC_LOCK(td->td_proc);
	if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
	    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);
#else
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp)
{
	struct vattr va;
	void *handle;
	vm_object_t obj;
	struct mount *mp;
	int error, flags, type;
	int vfslocked;

	mp = vp->v_mount;
	vfslocked = VFS_LOCK_GIANT(mp);
	if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			vget(vp, LK_EXCLUSIVE, td);
		}
		type = OBJT_VNODE;
		handle = vp;
	} else if (vp->v_type == VCHR) {
		type = OBJT_DEVICE;
		handle = vp->v_rdev;

		/* XXX: lack threadref on device */
		if (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) {
			*maxprotp = VM_PROT_ALL;
			*flagsp |= MAP_ANON;
			error = 0;
			goto done;
		}
		/*
		 * cdevs do not provide private mappings of any kind.
		 */
		if ((*maxprotp & VM_PROT_WRITE) == 0 &&
		    (prot & PROT_WRITE) != 0) {
			error = EACCES;
			goto done;
		}
		if (flags & (MAP_PRIVATE|MAP_COPY)) {
			error = EINVAL;
			goto done;
		}
		/*
		 * Force device mappings to be shared.
		 */
		flags |= MAP_SHARED;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) {
		goto done;
	}
#ifdef MAC
	error = mac_check_vnode_mmap(td->td_ucred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	if (vp->v_type == VREG) {
		objsize = round_page(va.va_size);
		if (va.va_nlink == 0)
			flags |= MAP_NOSYNC;
	}
	obj = vm_pager_allocate(type, handle, objsize, prot, foff);
	if (obj == NULL) {
		error = (type == OBJT_DEVICE ? EINVAL : ENOMEM);
		goto done;
	}
	*objp = obj;
	*flagsp = flags;
	vfs_mark_atime(vp, td);

done:
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t foff, vm_object_t *objp)
{
	vm_object_t obj;
	int flags;

	flags = *flagsp;

	/* XXX: lack threadref on device */
	if (cdev->si_devsw->d_flags & D_MMAP_ANON) {
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_check_cdev_mmap(td->td_ucred, cdev, prot);
	if (error != 0)
		return (error);
#endif
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, foff);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	int rv = KERN_SUCCESS;
	int docow, error;
	struct thread *td = curthread;

	if (size == 0)
		return (0);

	size = round_page(size);

	PROC_LOCK(td->td_proc);
	if (td->td_proc->p_vmspace->vm_map.size + size >
	    lim_cur(td->td_proc, RLIMIT_VMEM)) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
		(void) vm_map_remove(map, *addr, *addr + size);
	}
	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		docow = MAP_PREFAULT_PARTIAL;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	if (fitit)
		*addr = pmap_addr_hint(object, *addr, size);

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else
		rv = vm_map_find(map, object, foff, addr, size, fitit,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	} else if (flags & MAP_SHARED) {
		/*
		 * Shared memory is also shared with children.
		 */
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS)
			(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * If the process has requested that all future mappings
	 * be wired, then heed this.
	 */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_wire(map, *addr, *addr + size,
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}