/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mac.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
        int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.
 * We want a default that is good enough to prevent the kernel from running
 * out of resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
        void *dummy;
{
        max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
        max_proc_mmap /= 100;
}

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t, vm_object_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
        struct thread *td;
        struct sbrk_args *uap;
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
        struct thread *td;
        struct sstk_args *uap;
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
        int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
        struct thread *td;
        struct getpagesize_args *uap;
{
        /* MP SAFE */
        td->td_retval[0] = PAGE_SIZE;
        return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
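 *
 * As a worked illustration of the alignment rule above (an example added
 * for clarity, not present in the original sources): with a 4K PAGE_SIZE,
 * a hypothetical call such as
 *
 *	p = mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 0x1234);
 *
 * maps the file starting at trunc_page(0x1234) == 0x1000 and returns an
 * address whose page offset is 0x234, so p refers to the byte at file
 * offset 0x1234.  Under MAP_FIXED the requested address must share that
 * same remainder modulo PAGE_SIZE or the call fails with EINVAL.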
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
        void *addr;
        size_t len;
        int prot;
        int flags;
        int fd;
        long pad;
        off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
        struct thread *td;
        struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
        struct pmckern_map_in pkm;
#endif
        struct file *fp;
        struct vnode *vp;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot, maxprot;
        void *handle;
        objtype_t handle_type;
        int flags, error;
        off_t pos;
        struct vmspace *vms = td->td_proc->p_vmspace;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
        flags = uap->flags;
        pos = uap->pos;

        fp = NULL;
        /* make sure mapping fits into numeric range etc */
        if ((ssize_t) uap->len < 0 ||
            ((flags & MAP_ANON) && uap->fd != -1))
                return (EINVAL);

        if (flags & MAP_STACK) {
                if ((uap->fd != -1) ||
                    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;			/* low end... */
        size = (vm_size_t) round_page(size);	/* hi end */

        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);
                /* Address range must be all in user VM space. */
                if (addr < vm_map_min(&vms->vm_map) ||
                    addr + size > vm_map_max(&vms->vm_map))
                        return (EINVAL);
                if (addr + size < addr)
                        return (EINVAL);
        } else {
                /*
                 * XXX for non-fixed mappings where no hint is provided or
                 * the hint would fall in the potential heap space,
                 * place it after the end of the largest possible heap.
                 *
                 * There should really be a pmap call to determine a reasonable
                 * location.
                 */
                PROC_LOCK(td->td_proc);
                if (addr == 0 ||
                    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
                    addr < round_page((vm_offset_t)vms->vm_daddr +
                    lim_max(td->td_proc, RLIMIT_DATA))))
                        addr = round_page((vm_offset_t)vms->vm_daddr +
                            lim_max(td->td_proc, RLIMIT_DATA));
                PROC_UNLOCK(td->td_proc);
        }
        if (flags & MAP_ANON) {
                /*
                 * Mapping blank space is trivial.
                 */
                handle = NULL;
                handle_type = OBJT_DEFAULT;
                maxprot = VM_PROT_ALL;
                pos = 0;
        } else {
                /*
                 * Mapping file, get fp for validation.  Obtain vnode and make
                 * sure it is of appropriate type.
                 * Don't let the descriptor disappear on us if we block.
                 */
                if ((error = fget(td, uap->fd, &fp)) != 0)
                        goto done;
                if (fp->f_type != DTYPE_VNODE) {
                        error = ENODEV;
                        goto done;
                }
                /*
                 * POSIX shared-memory objects are defined to have
                 * kernel persistence, and are not defined to support
                 * read(2)/write(2) -- or even open(2).  Thus, we can
                 * use MAP_NOSYNC to trade on-disk coherence for speed.
                 * The shm_open(3) library routine turns on the FPOSIXSHM
                 * flag to request this behavior.
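                 *
                 * For illustration only (not from the original sources),
                 * the usual userland sequence behind this path looks
                 * roughly like:
                 *
                 *	fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
                 *	ftruncate(fd, len);
                 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                 *	    MAP_SHARED, fd, 0);
                 *
                 * The descriptor produced by shm_open() carries FPOSIXSHM,
                 * so the test below implicitly adds MAP_NOSYNC to the
                 * mapping.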
                 */
                if (fp->f_flag & FPOSIXSHM)
                        flags |= MAP_NOSYNC;
                vp = fp->f_vnode;
                /*
                 * Ensure that file and memory protections are
                 * compatible.  Note that we only worry about
                 * writability if mapping is shared; in this case,
                 * current and max prot are dictated by the open file.
                 * XXX use the vnode instead?  Problem is: what
                 * credentials do we use for determination?  What if
                 * proc does a setuid?
                 */
                if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
                        maxprot = VM_PROT_NONE;
                else
                        maxprot = VM_PROT_EXECUTE;
                if (fp->f_flag & FREAD) {
                        maxprot |= VM_PROT_READ;
                } else if (prot & PROT_READ) {
                        error = EACCES;
                        goto done;
                }
                /*
                 * If we are sharing potential changes (either via
                 * MAP_SHARED or via the implicit sharing of character
                 * device mappings), and we are trying to get write
                 * permission although we opened it without asking
                 * for it, bail out.
                 */
                if ((flags & MAP_SHARED) != 0) {
                        if ((fp->f_flag & FWRITE) != 0) {
                                maxprot |= VM_PROT_WRITE;
                        } else if ((prot & PROT_WRITE) != 0) {
                                error = EACCES;
                                goto done;
                        }
                } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
                        maxprot |= VM_PROT_WRITE;
                }
                handle = (void *)vp;
                handle_type = OBJT_VNODE;
        }

        /*
         * Do not allow more than a certain number of vm_map_entry structures
         * per process.  Scale with the number of rforks sharing the map
         * to make the limit reasonable for threads.
         */
        if (max_proc_mmap &&
            vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
                error = ENOMEM;
                goto done;
        }

        error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
            flags, handle_type, handle, pos);
#ifdef HWPMC_HOOKS
        /* inform hwpmc(4) if an executable is being mapped */
        if (error == 0 && handle_type == OBJT_VNODE &&
            (prot & PROT_EXEC)) {
                pkm.pm_file = handle;
                pkm.pm_address = (uintptr_t) addr;
                PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
        }
#endif
        if (error == 0)
                td->td_retval[0] = (register_t) (addr + pageoff);
done:
        if (fp)
                fdrop(fp, td);

        return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
        caddr_t addr;
        int len;
        int prot;
        int flags;
        int fd;
        long pos;
};
#endif
int
ommap(td, uap)
        struct thread *td;
        struct ommap_args *uap;
{
        struct mmap_args nargs;
        static const char cvtbsdprot[8] = {
                0,
                PROT_EXEC,
                PROT_WRITE,
                PROT_EXEC | PROT_WRITE,
                PROT_READ,
                PROT_EXEC | PROT_READ,
                PROT_WRITE | PROT_READ,
                PROT_EXEC | PROT_WRITE | PROT_READ,
        };

#define OMAP_ANON	0x0002
#define OMAP_COPY	0x0020
#define OMAP_SHARED	0x0010
#define OMAP_FIXED	0x0100

        nargs.addr = uap->addr;
        nargs.len = uap->len;
        nargs.prot = cvtbsdprot[uap->prot & 0x7];
        nargs.flags = 0;
        if (uap->flags & OMAP_ANON)
                nargs.flags |= MAP_ANON;
        if (uap->flags & OMAP_COPY)
                nargs.flags |= MAP_COPY;
        if (uap->flags & OMAP_SHARED)
                nargs.flags |= MAP_SHARED;
        else
                nargs.flags |= MAP_PRIVATE;
        if (uap->flags & OMAP_FIXED)
                nargs.flags |= MAP_FIXED;
        nargs.fd = uap->fd;
        nargs.pos = uap->pos;
        return (mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
        void *addr;
        int len;
        int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
        struct thread *td;
        struct msync_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int flags;
        vm_map_t map;
        int rv;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        flags = uap->flags;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
                return (EINVAL);

        map = &td->td_proc->p_vmspace->vm_map;

        /*
         * Clean the pages and interpret the return value.
         */
        rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
            (flags & MS_INVALIDATE) != 0);
        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
                return (EINVAL);	/* Sun returns ENOMEM? */
        case KERN_INVALID_ARGUMENT:
                return (EBUSY);
        default:
                return (EINVAL);
        }
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
        void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
        struct thread *td;
        struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
        struct pmckern_map_out pkm;
        vm_map_entry_t entry;
#endif
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        if (size == 0)
                return (EINVAL);

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap...
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
                return (EINVAL);
        vm_map_lock(map);
        /*
         * Make sure entire range is allocated.
         */
        if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
                vm_map_unlock(map);
                return (EINVAL);
        }
#ifdef HWPMC_HOOKS
        /*
         * Inform hwpmc if the address range being unmapped contains
         * an executable region.
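         * The scan below walks the map entries that overlap the range
         * [addr, addr + size) and issues a single PMC_FN_MUNMAP callout
         * as soon as one of those entries is found to be executable.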
         */
        if (vm_map_lookup_entry(map, addr, &entry)) {
                for (;
                    entry != &map->header && entry->start < addr + size;
                    entry = entry->next) {
                        if (vm_map_check_protection(map, entry->start,
                            entry->end, VM_PROT_EXECUTE) == TRUE) {
                                pkm.pm_address = (uintptr_t) addr;
                                pkm.pm_size = (size_t) size;
                                PMC_CALL_HOOK(td, PMC_FN_MUNMAP,
                                    (void *) &pkm);
                                break;
                        }
                }
        }
#endif
        /* returns nothing but KERN_SUCCESS anyway */
        vm_map_delete(map, addr, addr + size);
        vm_map_unlock(map);
        return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
        const void *addr;
        size_t len;
        int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
        struct thread *td;
        struct mprotect_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;
#endif

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, prot, FALSE)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
        void *addr;
        size_t len;
        int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
        struct thread *td;
        struct minherit_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_inherit_t inherit;

        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        inherit = uap->inherit;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, inherit)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
        void *addr;
        size_t len;
        int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
        struct thread *td;
        struct madvise_args *uap;
{
        vm_offset_t start, end;
        vm_map_t map;
        struct proc *p;
        int error;

        /*
         * Check for our special case, advising the swap pager we are
         * "immortal."
         */
        if (uap->behav == MADV_PROTECT) {
                error = suser(td);
                if (error == 0) {
                        p = td->td_proc;
                        PROC_LOCK(p);
                        p->p_flag |= P_PROTECTED;
                        PROC_UNLOCK(p);
                }
                return (error);
        }
        /*
         * Check for illegal behavior
         */
        if (uap->behav < 0 || uap->behav > MADV_CORE)
                return (EINVAL);
        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if ((vm_offset_t)uap->addr < vm_map_min(map) ||
            (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
                return (EINVAL);
        if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
                return (EINVAL);

        /*
         * Since this routine is only advisory, we default to conservative
         * behavior.
         */
        start = trunc_page((vm_offset_t) uap->addr);
        end = round_page((vm_offset_t) uap->addr + uap->len);

        if (vm_map_madvise(map, start, end, uap->behav))
                return (EINVAL);
        return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
        const void *addr;
        size_t len;
        char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
        struct thread *td;
        struct mincore_args *uap;
{
        vm_offset_t addr, first_addr;
        vm_offset_t end, cend;
        pmap_t pmap;
        vm_map_t map;
        char *vec;
        int error = 0;
        int vecindex, lastvecindex;
        vm_map_entry_t current;
        vm_map_entry_t entry;
        int mincoreinfo;
        unsigned int timestamp;

        /*
         * Make sure that the addresses presented are valid for user
         * mode.
         */
        first_addr = addr = trunc_page((vm_offset_t) uap->addr);
        end = addr + (vm_size_t)round_page(uap->len);
        map = &td->td_proc->p_vmspace->vm_map;
        if (end > vm_map_max(map) || end < addr)
                return (ENOMEM);

        /*
         * Address of byte vector
         */
        vec = uap->vec;

        pmap = vmspace_pmap(td->td_proc->p_vmspace);

        vm_map_lock_read(map);
RestartScan:
        timestamp = map->timestamp;

        if (!vm_map_lookup_entry(map, addr, &entry)) {
                vm_map_unlock_read(map);
                return (ENOMEM);
        }

        /*
         * Do this on a map entry basis so that if the pages are not
         * in the current process's address space, we can easily look
         * up the pages elsewhere.
         */
        lastvecindex = -1;
        for (current = entry;
            (current != &map->header) && (current->start < end);
            current = current->next) {

                /*
                 * check for contiguity
                 */
                if (current->end < end &&
                    (current->next == &map->header ||
                    current->next->start > current->end)) {
                        vm_map_unlock_read(map);
                        return (ENOMEM);
                }

                /*
                 * ignore submaps (for now) or null objects
                 */
                if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
                    current->object.vm_object == NULL)
                        continue;

                /*
                 * limit this scan to the current map entry and the
                 * limits for the mincore call
                 */
                if (addr < current->start)
                        addr = current->start;
                cend = current->end;
                if (cend > end)
                        cend = end;

                /*
                 * scan this entry one page at a time
                 */
                while (addr < cend) {
                        /*
                         * Check pmap first, it is likely faster, also
                         * it can provide info as to whether we are the
                         * one referencing or modifying the page.
                         */
                        mincoreinfo = pmap_mincore(pmap, addr);
                        if (!mincoreinfo) {
                                vm_pindex_t pindex;
                                vm_ooffset_t offset;
                                vm_page_t m;
                                /*
                                 * calculate the page index into the object
                                 */
                                offset = current->offset + (addr - current->start);
                                pindex = OFF_TO_IDX(offset);
                                VM_OBJECT_LOCK(current->object.vm_object);
                                m = vm_page_lookup(current->object.vm_object,
                                    pindex);
                                /*
                                 * if the page is resident, then gather information about
                                 * it.
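                                 * The flags gathered here are MINCORE_INCORE,
                                 * plus MINCORE_MODIFIED_OTHER if the page is
                                 * dirty and MINCORE_REFERENCED_OTHER if it has
                                 * been referenced; the "_OTHER" forms are used
                                 * because, with no entry in our pmap, the
                                 * activity cannot be attributed to the calling
                                 * process itself.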
                                 */
                                if (m != NULL && m->valid != 0) {
                                        mincoreinfo = MINCORE_INCORE;
                                        vm_page_lock_queues();
                                        if (m->dirty ||
                                            pmap_is_modified(m))
                                                mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                        if ((m->flags & PG_REFERENCED) ||
                                            pmap_ts_referenced(m)) {
                                                vm_page_flag_set(m, PG_REFERENCED);
                                                mincoreinfo |= MINCORE_REFERENCED_OTHER;
                                        }
                                        vm_page_unlock_queues();
                                }
                                VM_OBJECT_UNLOCK(current->object.vm_object);
                        }

                        /*
                         * subyte may page fault.  In case it needs to modify
                         * the map, we release the lock.
                         */
                        vm_map_unlock_read(map);

                        /*
                         * calculate index into user supplied byte vector
                         */
                        vecindex = OFF_TO_IDX(addr - first_addr);

                        /*
                         * If we have skipped map entries, we need to make sure that
                         * the byte vector is zeroed for those skipped entries.
                         */
                        while ((lastvecindex + 1) < vecindex) {
                                ++lastvecindex;
                                error = subyte(vec + lastvecindex, 0);
                                if (error) {
                                        error = EFAULT;
                                        goto done2;
                                }
                        }

                        /*
                         * Pass the page information to the user
                         */
                        error = subyte(vec + vecindex, mincoreinfo);
                        if (error) {
                                error = EFAULT;
                                goto done2;
                        }

                        /*
                         * If the map has changed, due to the subyte, the previous
                         * output may be invalid.
                         */
                        vm_map_lock_read(map);
                        if (timestamp != map->timestamp)
                                goto RestartScan;

                        lastvecindex = vecindex;
                        addr += PAGE_SIZE;
                }
        }

        /*
         * subyte may page fault.  In case it needs to modify
         * the map, we release the lock.
         */
        vm_map_unlock_read(map);

        /*
         * Zero the last entries in the byte vector.
         */
        vecindex = OFF_TO_IDX(end - first_addr);
        while ((lastvecindex + 1) < vecindex) {
                ++lastvecindex;
                error = subyte(vec + lastvecindex, 0);
                if (error) {
                        error = EFAULT;
                        goto done2;
                }
        }

        /*
         * If the map has changed, due to the subyte, the previous
         * output may be invalid.
         */
        vm_map_lock_read(map);
        if (timestamp != map->timestamp)
                goto RestartScan;
        vm_map_unlock_read(map);
done2:
        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
        struct thread *td;
        struct mlock_args *uap;
{
        struct proc *proc;
        vm_offset_t addr, end, last, start;
        vm_size_t npages, size;
        int error;

        error = suser(td);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        npages = atop(end - start);
        if (npages > vm_page_max_wired)
                return (ENOMEM);
        proc = td->td_proc;
        PROC_LOCK(proc);
        if (ptoa(npages +
            pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
            lim_cur(proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(proc);
        if (npages + cnt.v_wire_count > vm_page_max_wired)
                return (EAGAIN);
        error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
        int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
        struct thread *td;
        struct mlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = 0;

        if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
                return (EINVAL);

#if 0
        /*
         * If wiring all pages in the process would cause it to exceed
         * a hard resource limit, return ENOMEM.
         */
        PROC_LOCK(td->td_proc);
        if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
            lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(td->td_proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(td->td_proc);
#else
        error = suser(td);
        if (error)
                return (error);
#endif

        if (uap->how & MCL_FUTURE) {
                vm_map_lock(map);
                vm_map_modflags(map, MAP_WIREFUTURE, 0);
                vm_map_unlock(map);
                error = 0;
        }

        if (uap->how & MCL_CURRENT) {
                /*
                 * P1003.1-2001 mandates that all currently mapped pages
                 * will be memory resident and locked (wired) upon return
                 * from mlockall().  vm_map_wire() will wire pages, by
                 * calling vm_fault_wire() for each page in the region.
                 */
                error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
                error = (error == KERN_SUCCESS ? 0 : EAGAIN);
        }

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
        register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
        struct thread *td;
        struct munlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = suser(td);
        if (error)
                return (error);

        /* Clear the MAP_WIREFUTURE flag from this vm_map. */
        vm_map_lock(map);
        vm_map_modflags(map, 0, MAP_WIREFUTURE);
        vm_map_unlock(map);

        /* Forcibly unwire all pages. */
        error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
            VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
        struct thread *td;
        struct munlock_args *uap;
{
        vm_offset_t addr, end, last, start;
        vm_size_t size;
        int error;

        error = suser(td);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to mmap
 * operations on vnodes.
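 *
 * On success the backing VM object is normally returned through *objp and
 * *flagsp/*maxprotp may be adjusted; a VCHR vnode whose driver advertises
 * D_MMAP_ANON is instead converted into an anonymous mapping (MAP_ANON is
 * added to *flagsp) and no object is allocated here.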
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp)
{
        struct vattr va;
        void *handle;
        vm_object_t obj;
        struct mount *mp;
        int error, flags, type;
        int vfslocked;

        mp = vp->v_mount;
        vfslocked = VFS_LOCK_GIANT(mp);
        if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
                VFS_UNLOCK_GIANT(vfslocked);
                return (error);
        }
        flags = *flagsp;
        obj = vp->v_object;
        if (vp->v_type == VREG) {
                /*
                 * Get the proper underlying object
                 */
                if (obj == NULL) {
                        error = EINVAL;
                        goto done;
                }
                if (obj->handle != vp) {
                        vput(vp);
                        vp = (struct vnode*)obj->handle;
                        vget(vp, LK_EXCLUSIVE, td);
                }
                type = OBJT_VNODE;
                handle = vp;
        } else if (vp->v_type == VCHR) {
                type = OBJT_DEVICE;
                handle = vp->v_rdev;

                /* XXX: lack threadref on device */
                if (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) {
                        *maxprotp = VM_PROT_ALL;
                        *flagsp |= MAP_ANON;
                        error = 0;
                        goto done;
                }
                /*
                 * cdevs do not provide private mappings of any kind.
                 */
                if ((*maxprotp & VM_PROT_WRITE) == 0 &&
                    (prot & PROT_WRITE) != 0) {
                        error = EACCES;
                        goto done;
                }
                if (flags & (MAP_PRIVATE|MAP_COPY)) {
                        error = EINVAL;
                        goto done;
                }
                /*
                 * Force device mappings to be shared.
                 */
                flags |= MAP_SHARED;
        } else {
                error = EINVAL;
                goto done;
        }
        if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) {
                goto done;
        }
#ifdef MAC
        error = mac_check_vnode_mmap(td->td_ucred, vp, prot, flags);
        if (error != 0)
                goto done;
#endif
        if ((flags & MAP_SHARED) != 0) {
                if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
                        if (prot & PROT_WRITE) {
                                error = EPERM;
                                goto done;
                        }
                        *maxprotp &= ~VM_PROT_WRITE;
                }
        }
        /*
         * If it is a regular file without any references
         * we do not need to sync it.
         * Adjust object size to be the size of actual file.
         */
        if (vp->v_type == VREG) {
                objsize = round_page(va.va_size);
                if (va.va_nlink == 0)
                        flags |= MAP_NOSYNC;
        }
        obj = vm_pager_allocate(type, handle, objsize, prot, foff);
        if (obj == NULL) {
                error = (type == OBJT_DEVICE ? EINVAL : ENOMEM);
                goto done;
        }
        *objp = obj;
        *flagsp = flags;
        vfs_mark_atime(vp, td);

done:
        vput(vp);
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t foff, vm_object_t *objp)
{
        vm_object_t obj;
        int flags;

        flags = *flagsp;

        /* XXX: lack threadref on device */
        if (cdev->si_devsw->d_flags & D_MMAP_ANON) {
                *maxprotp = VM_PROT_ALL;
                *flagsp |= MAP_ANON;
                return (0);
        }
        /*
         * cdevs do not provide private mappings of any kind.
         */
        if ((*maxprotp & VM_PROT_WRITE) == 0 &&
            (prot & PROT_WRITE) != 0)
                return (EACCES);
        if (flags & (MAP_PRIVATE|MAP_COPY))
                return (EINVAL);
        /*
         * Force device mappings to be shared.
         */
        flags |= MAP_SHARED;
#ifdef MAC_XXX
        error = mac_check_cdev_mmap(td->td_ucred, cdev, prot);
        if (error != 0)
                return (error);
#endif
        obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, foff);
        if (obj == NULL)
                return (EINVAL);
        *objp = obj;
        *flagsp = flags;
        return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and System V
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
        vm_prot_t maxprot, int flags,
        objtype_t handle_type, void *handle,
        vm_ooffset_t foff)
{
        boolean_t fitit;
        vm_object_t object;
        int rv = KERN_SUCCESS;
        int docow, error;
        struct thread *td = curthread;

        if (size == 0)
                return (0);

        size = round_page(size);

        PROC_LOCK(td->td_proc);
        if (td->td_proc->p_vmspace->vm_map.size + size >
            lim_cur(td->td_proc, RLIMIT_VMEM)) {
                PROC_UNLOCK(td->td_proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(td->td_proc);

        /*
         * We currently can only deal with page aligned file offsets.
         * The check is here rather than in the syscall because the
         * kernel calls this function internally for other mmapping
         * operations (such as in exec) and non-aligned offsets will
         * cause pmap inconsistencies...so we want to be sure to
         * disallow this in all cases.
         */
        if (foff & PAGE_MASK)
                return (EINVAL);

        if ((flags & MAP_FIXED) == 0) {
                fitit = TRUE;
                *addr = round_page(*addr);
        } else {
                if (*addr != trunc_page(*addr))
                        return (EINVAL);
                fitit = FALSE;
                (void) vm_map_remove(map, *addr, *addr + size);
        }
        /*
         * Lookup/allocate object.
         */
        switch (handle_type) {
        case OBJT_DEVICE:
                error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
                    handle, foff, &object);
                break;
        case OBJT_VNODE:
                error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
                    handle, foff, &object);
                break;
        case OBJT_DEFAULT:
                if (handle == NULL) {
                        error = 0;
                        break;
                }
                /* FALLTHROUGH */
        default:
                error = EINVAL;
        }
        if (error)
                return (error);
        if (flags & MAP_ANON) {
                object = NULL;
                docow = 0;
                /*
                 * Unnamed anonymous regions always start at 0.
                 */
                if (handle == 0)
                        foff = 0;
        } else {
                docow = MAP_PREFAULT_PARTIAL;
        }

        if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
                docow |= MAP_COPY_ON_WRITE;
        if (flags & MAP_NOSYNC)
                docow |= MAP_DISABLE_SYNCER;
        if (flags & MAP_NOCORE)
                docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;

        if (maxprot & VM_PROT_READ)
                maxprot |= VM_PROT_EXECUTE;
#endif

        if (fitit)
                *addr = pmap_addr_hint(object, *addr, size);

        if (flags & MAP_STACK)
                rv = vm_map_stack(map, *addr, size, prot, maxprot,
                    docow | MAP_STACK_GROWS_DOWN);
        else
                rv = vm_map_find(map, object, foff, addr, size, fitit,
                    prot, maxprot, docow);

        if (rv != KERN_SUCCESS) {
                /*
                 * Lose the object reference.  Will destroy the
                 * object if it's an unnamed anonymous mapping
                 * or named anonymous without other references.
                 */
                vm_object_deallocate(object);
        } else if (flags & MAP_SHARED) {
                /*
                 * Shared memory is also shared with children.
                 */
                rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
                if (rv != KERN_SUCCESS)
                        (void) vm_map_remove(map, *addr, *addr + size);
        }

        /*
         * If the process has requested that all future mappings
         * be wired, then heed this.
         */
        if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
                vm_map_wire(map, *addr, *addr + size,
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
        case KERN_NO_SPACE:
                return (ENOMEM);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        default:
                return (EINVAL);
        }
}
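
/*
 * Illustrative sketch (not part of the original file): a kernel caller
 * creating an anonymous mapping through the internal interface above
 * might look roughly like
 *
 *	vm_offset_t addr = 0;
 *	error = vm_mmap(&p->p_vmspace->vm_map, &addr, round_page(len),
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_ANON,
 *	    OBJT_DEFAULT, NULL, 0);
 *
 * where "p" is the target process.  On success the chosen address is
 * returned in addr; failures map to errno values as at the end of
 * vm_mmap() above.
 */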