1 /*- 2 * Copyright (c) 1988 University of Utah. 3 * Copyright (c) 1991, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the Systems Programming Group of the University of Utah Computer 8 * Science Department. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 35 * 36 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 37 */ 38 39 /* 40 * Mapped file (mmap) interface to VM 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_compat.h" 47 #include "opt_hwpmc_hooks.h" 48 #include "opt_mac.h" 49 50 #include <sys/param.h> 51 #include <sys/systm.h> 52 #include <sys/kernel.h> 53 #include <sys/lock.h> 54 #include <sys/mutex.h> 55 #include <sys/sysproto.h> 56 #include <sys/filedesc.h> 57 #include <sys/proc.h> 58 #include <sys/resource.h> 59 #include <sys/resourcevar.h> 60 #include <sys/vnode.h> 61 #include <sys/fcntl.h> 62 #include <sys/file.h> 63 #include <sys/mac.h> 64 #include <sys/mman.h> 65 #include <sys/mount.h> 66 #include <sys/conf.h> 67 #include <sys/stat.h> 68 #include <sys/vmmeter.h> 69 #include <sys/sysctl.h> 70 71 #include <vm/vm.h> 72 #include <vm/vm_param.h> 73 #include <vm/pmap.h> 74 #include <vm/vm_map.h> 75 #include <vm/vm_object.h> 76 #include <vm/vm_page.h> 77 #include <vm/vm_pager.h> 78 #include <vm/vm_pageout.h> 79 #include <vm/vm_extern.h> 80 #include <vm/vm_page.h> 81 #include <vm/vm_kern.h> 82 83 #ifdef HWPMC_HOOKS 84 #include <sys/pmckern.h> 85 #endif 86 87 #ifndef _SYS_SYSPROTO_H_ 88 struct sbrk_args { 89 int incr; 90 }; 91 #endif 92 93 static int max_proc_mmap; 94 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, ""); 95 96 /* 97 * Set the maximum number of vm_map_entry structures per process. Roughly 98 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 99 * of our KVM malloc space still results in generous limits. We want a 100 * default that is good enough to prevent the kernel running out of resources 101 * if attacked from compromised user account but generous enough such that 102 * multi-threaded processes are not unduly inconvenienced. 103 */ 104 static void vmmapentry_rsrc_init(void *); 105 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL) 106 107 static void 108 vmmapentry_rsrc_init(dummy) 109 void *dummy; 110 { 111 max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); 112 max_proc_mmap /= 100; 113 } 114 115 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 116 int *, struct vnode *, vm_ooffset_t, vm_object_t *); 117 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 118 int *, struct cdev *, vm_ooffset_t, vm_object_t *); 119 120 /* 121 * MPSAFE 122 */ 123 /* ARGSUSED */ 124 int 125 sbrk(td, uap) 126 struct thread *td; 127 struct sbrk_args *uap; 128 { 129 /* Not yet implemented */ 130 return (EOPNOTSUPP); 131 } 132 133 #ifndef _SYS_SYSPROTO_H_ 134 struct sstk_args { 135 int incr; 136 }; 137 #endif 138 139 /* 140 * MPSAFE 141 */ 142 /* ARGSUSED */ 143 int 144 sstk(td, uap) 145 struct thread *td; 146 struct sstk_args *uap; 147 { 148 /* Not yet implemented */ 149 return (EOPNOTSUPP); 150 } 151 152 #if defined(COMPAT_43) 153 #ifndef _SYS_SYSPROTO_H_ 154 struct getpagesize_args { 155 int dummy; 156 }; 157 #endif 158 159 /* ARGSUSED */ 160 int 161 ogetpagesize(td, uap) 162 struct thread *td; 163 struct getpagesize_args *uap; 164 { 165 /* MP SAFE */ 166 td->td_retval[0] = PAGE_SIZE; 167 return (0); 168 } 169 #endif /* COMPAT_43 */ 170 171 172 /* 173 * Memory Map (mmap) system call. Note that the file offset 174 * and address are allowed to be NOT page aligned, though if 175 * the MAP_FIXED flag it set, both must have the same remainder 176 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 177 * page-aligned, the actual mapping starts at trunc_page(addr) 178 * and the return value is adjusted up by the page offset. 179 * 180 * Generally speaking, only character devices which are themselves 181 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 182 * there would be no cache coherency between a descriptor and a VM mapping 183 * both to the same character device. 184 * 185 * Block devices can be mmap'd no matter what they represent. Cache coherency 186 * is maintained as long as you do not write directly to the underlying 187 * character device. 188 */ 189 #ifndef _SYS_SYSPROTO_H_ 190 struct mmap_args { 191 void *addr; 192 size_t len; 193 int prot; 194 int flags; 195 int fd; 196 long pad; 197 off_t pos; 198 }; 199 #endif 200 201 /* 202 * MPSAFE 203 */ 204 int 205 mmap(td, uap) 206 struct thread *td; 207 struct mmap_args *uap; 208 { 209 #ifdef HWPMC_HOOKS 210 struct pmckern_map_in pkm; 211 #endif 212 struct file *fp; 213 struct vnode *vp; 214 vm_offset_t addr; 215 vm_size_t size, pageoff; 216 vm_prot_t prot, maxprot; 217 void *handle; 218 objtype_t handle_type; 219 int flags, error; 220 off_t pos; 221 struct vmspace *vms = td->td_proc->p_vmspace; 222 223 addr = (vm_offset_t) uap->addr; 224 size = uap->len; 225 prot = uap->prot & VM_PROT_ALL; 226 flags = uap->flags; 227 pos = uap->pos; 228 229 fp = NULL; 230 /* make sure mapping fits into numeric range etc */ 231 if ((ssize_t) uap->len < 0 || 232 ((flags & MAP_ANON) && uap->fd != -1)) 233 return (EINVAL); 234 235 if (flags & MAP_STACK) { 236 if ((uap->fd != -1) || 237 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 238 return (EINVAL); 239 flags |= MAP_ANON; 240 pos = 0; 241 } 242 243 /* 244 * Align the file position to a page boundary, 245 * and save its page offset component. 246 */ 247 pageoff = (pos & PAGE_MASK); 248 pos -= pageoff; 249 250 /* Adjust size for rounding (on both ends). */ 251 size += pageoff; /* low end... */ 252 size = (vm_size_t) round_page(size); /* hi end */ 253 254 /* 255 * Check for illegal addresses. Watch out for address wrap... Note 256 * that VM_*_ADDRESS are not constants due to casts (argh). 257 */ 258 if (flags & MAP_FIXED) { 259 /* 260 * The specified address must have the same remainder 261 * as the file offset taken modulo PAGE_SIZE, so it 262 * should be aligned after adjustment by pageoff. 263 */ 264 addr -= pageoff; 265 if (addr & PAGE_MASK) 266 return (EINVAL); 267 /* Address range must be all in user VM space. */ 268 if (addr < vm_map_min(&vms->vm_map) || 269 addr + size > vm_map_max(&vms->vm_map)) 270 return (EINVAL); 271 if (addr + size < addr) 272 return (EINVAL); 273 } else { 274 /* 275 * XXX for non-fixed mappings where no hint is provided or 276 * the hint would fall in the potential heap space, 277 * place it after the end of the largest possible heap. 278 * 279 * There should really be a pmap call to determine a reasonable 280 * location. 281 */ 282 PROC_LOCK(td->td_proc); 283 if (addr == 0 || 284 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 285 addr < round_page((vm_offset_t)vms->vm_daddr + 286 lim_max(td->td_proc, RLIMIT_DATA)))) 287 addr = round_page((vm_offset_t)vms->vm_daddr + 288 lim_max(td->td_proc, RLIMIT_DATA)); 289 PROC_UNLOCK(td->td_proc); 290 } 291 if (flags & MAP_ANON) { 292 /* 293 * Mapping blank space is trivial. 294 */ 295 handle = NULL; 296 handle_type = OBJT_DEFAULT; 297 maxprot = VM_PROT_ALL; 298 pos = 0; 299 } else { 300 /* 301 * Mapping file, get fp for validation. Obtain vnode and make 302 * sure it is of appropriate type. 303 * don't let the descriptor disappear on us if we block 304 */ 305 if ((error = fget(td, uap->fd, &fp)) != 0) 306 goto done; 307 if (fp->f_type != DTYPE_VNODE) { 308 error = EINVAL; 309 goto done; 310 } 311 /* 312 * POSIX shared-memory objects are defined to have 313 * kernel persistence, and are not defined to support 314 * read(2)/write(2) -- or even open(2). Thus, we can 315 * use MAP_ASYNC to trade on-disk coherence for speed. 316 * The shm_open(3) library routine turns on the FPOSIXSHM 317 * flag to request this behavior. 318 */ 319 if (fp->f_flag & FPOSIXSHM) 320 flags |= MAP_NOSYNC; 321 vp = fp->f_vnode; 322 /* 323 * Ensure that file and memory protections are 324 * compatible. Note that we only worry about 325 * writability if mapping is shared; in this case, 326 * current and max prot are dictated by the open file. 327 * XXX use the vnode instead? Problem is: what 328 * credentials do we use for determination? What if 329 * proc does a setuid? 330 */ 331 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) 332 maxprot = VM_PROT_NONE; 333 else 334 maxprot = VM_PROT_EXECUTE; 335 if (fp->f_flag & FREAD) { 336 maxprot |= VM_PROT_READ; 337 } else if (prot & PROT_READ) { 338 error = EACCES; 339 goto done; 340 } 341 /* 342 * If we are sharing potential changes (either via 343 * MAP_SHARED or via the implicit sharing of character 344 * device mappings), and we are trying to get write 345 * permission although we opened it without asking 346 * for it, bail out. 347 */ 348 if ((flags & MAP_SHARED) != 0) { 349 if ((fp->f_flag & FWRITE) != 0) { 350 maxprot |= VM_PROT_WRITE; 351 } else if ((prot & PROT_WRITE) != 0) { 352 error = EACCES; 353 goto done; 354 } 355 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { 356 maxprot |= VM_PROT_WRITE; 357 } 358 handle = (void *)vp; 359 handle_type = OBJT_VNODE; 360 } 361 362 /* 363 * Do not allow more then a certain number of vm_map_entry structures 364 * per process. Scale with the number of rforks sharing the map 365 * to make the limit reasonable for threads. 366 */ 367 if (max_proc_mmap && 368 vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { 369 error = ENOMEM; 370 goto done; 371 } 372 373 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 374 flags, handle_type, handle, pos); 375 #ifdef HWPMC_HOOKS 376 /* inform hwpmc(4) if an executable is being mapped */ 377 if (error == 0 && handle_type == OBJT_VNODE && 378 (prot & PROT_EXEC)) { 379 pkm.pm_file = handle; 380 pkm.pm_address = (uintptr_t) addr; 381 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); 382 } 383 #endif 384 if (error == 0) 385 td->td_retval[0] = (register_t) (addr + pageoff); 386 done: 387 if (fp) 388 fdrop(fp, td); 389 390 return (error); 391 } 392 393 #ifdef COMPAT_43 394 #ifndef _SYS_SYSPROTO_H_ 395 struct ommap_args { 396 caddr_t addr; 397 int len; 398 int prot; 399 int flags; 400 int fd; 401 long pos; 402 }; 403 #endif 404 int 405 ommap(td, uap) 406 struct thread *td; 407 struct ommap_args *uap; 408 { 409 struct mmap_args nargs; 410 static const char cvtbsdprot[8] = { 411 0, 412 PROT_EXEC, 413 PROT_WRITE, 414 PROT_EXEC | PROT_WRITE, 415 PROT_READ, 416 PROT_EXEC | PROT_READ, 417 PROT_WRITE | PROT_READ, 418 PROT_EXEC | PROT_WRITE | PROT_READ, 419 }; 420 421 #define OMAP_ANON 0x0002 422 #define OMAP_COPY 0x0020 423 #define OMAP_SHARED 0x0010 424 #define OMAP_FIXED 0x0100 425 426 nargs.addr = uap->addr; 427 nargs.len = uap->len; 428 nargs.prot = cvtbsdprot[uap->prot & 0x7]; 429 nargs.flags = 0; 430 if (uap->flags & OMAP_ANON) 431 nargs.flags |= MAP_ANON; 432 if (uap->flags & OMAP_COPY) 433 nargs.flags |= MAP_COPY; 434 if (uap->flags & OMAP_SHARED) 435 nargs.flags |= MAP_SHARED; 436 else 437 nargs.flags |= MAP_PRIVATE; 438 if (uap->flags & OMAP_FIXED) 439 nargs.flags |= MAP_FIXED; 440 nargs.fd = uap->fd; 441 nargs.pos = uap->pos; 442 return (mmap(td, &nargs)); 443 } 444 #endif /* COMPAT_43 */ 445 446 447 #ifndef _SYS_SYSPROTO_H_ 448 struct msync_args { 449 void *addr; 450 int len; 451 int flags; 452 }; 453 #endif 454 /* 455 * MPSAFE 456 */ 457 int 458 msync(td, uap) 459 struct thread *td; 460 struct msync_args *uap; 461 { 462 vm_offset_t addr; 463 vm_size_t size, pageoff; 464 int flags; 465 vm_map_t map; 466 int rv; 467 468 addr = (vm_offset_t) uap->addr; 469 size = uap->len; 470 flags = uap->flags; 471 472 pageoff = (addr & PAGE_MASK); 473 addr -= pageoff; 474 size += pageoff; 475 size = (vm_size_t) round_page(size); 476 if (addr + size < addr) 477 return (EINVAL); 478 479 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 480 return (EINVAL); 481 482 map = &td->td_proc->p_vmspace->vm_map; 483 484 /* 485 * Clean the pages and interpret the return value. 486 */ 487 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 488 (flags & MS_INVALIDATE) != 0); 489 switch (rv) { 490 case KERN_SUCCESS: 491 return (0); 492 case KERN_INVALID_ADDRESS: 493 return (EINVAL); /* Sun returns ENOMEM? */ 494 case KERN_INVALID_ARGUMENT: 495 return (EBUSY); 496 default: 497 return (EINVAL); 498 } 499 } 500 501 #ifndef _SYS_SYSPROTO_H_ 502 struct munmap_args { 503 void *addr; 504 size_t len; 505 }; 506 #endif 507 /* 508 * MPSAFE 509 */ 510 int 511 munmap(td, uap) 512 struct thread *td; 513 struct munmap_args *uap; 514 { 515 #ifdef HWPMC_HOOKS 516 struct pmckern_map_out pkm; 517 vm_map_entry_t entry; 518 #endif 519 vm_offset_t addr; 520 vm_size_t size, pageoff; 521 vm_map_t map; 522 523 addr = (vm_offset_t) uap->addr; 524 size = uap->len; 525 if (size == 0) 526 return (EINVAL); 527 528 pageoff = (addr & PAGE_MASK); 529 addr -= pageoff; 530 size += pageoff; 531 size = (vm_size_t) round_page(size); 532 if (addr + size < addr) 533 return (EINVAL); 534 535 /* 536 * Check for illegal addresses. Watch out for address wrap... 537 */ 538 map = &td->td_proc->p_vmspace->vm_map; 539 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 540 return (EINVAL); 541 vm_map_lock(map); 542 /* 543 * Make sure entire range is allocated. 544 */ 545 if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) { 546 vm_map_unlock(map); 547 return (EINVAL); 548 } 549 #ifdef HWPMC_HOOKS 550 /* 551 * Inform hwpmc if the address range being unmapped contains 552 * an executable region. 553 */ 554 if (vm_map_lookup_entry(map, addr, &entry)) { 555 for (; 556 entry != &map->header && entry->start < addr + size; 557 entry = entry->next) { 558 if (vm_map_check_protection(map, entry->start, 559 entry->end, VM_PROT_EXECUTE) == TRUE) { 560 pkm.pm_address = (uintptr_t) addr; 561 pkm.pm_size = (size_t) size; 562 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, 563 (void *) &pkm); 564 break; 565 } 566 } 567 } 568 #endif 569 /* returns nothing but KERN_SUCCESS anyway */ 570 vm_map_delete(map, addr, addr + size); 571 vm_map_unlock(map); 572 return (0); 573 } 574 575 #ifndef _SYS_SYSPROTO_H_ 576 struct mprotect_args { 577 const void *addr; 578 size_t len; 579 int prot; 580 }; 581 #endif 582 /* 583 * MPSAFE 584 */ 585 int 586 mprotect(td, uap) 587 struct thread *td; 588 struct mprotect_args *uap; 589 { 590 vm_offset_t addr; 591 vm_size_t size, pageoff; 592 vm_prot_t prot; 593 594 addr = (vm_offset_t) uap->addr; 595 size = uap->len; 596 prot = uap->prot & VM_PROT_ALL; 597 #if defined(VM_PROT_READ_IS_EXEC) 598 if (prot & VM_PROT_READ) 599 prot |= VM_PROT_EXECUTE; 600 #endif 601 602 pageoff = (addr & PAGE_MASK); 603 addr -= pageoff; 604 size += pageoff; 605 size = (vm_size_t) round_page(size); 606 if (addr + size < addr) 607 return (EINVAL); 608 609 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 610 addr + size, prot, FALSE)) { 611 case KERN_SUCCESS: 612 return (0); 613 case KERN_PROTECTION_FAILURE: 614 return (EACCES); 615 } 616 return (EINVAL); 617 } 618 619 #ifndef _SYS_SYSPROTO_H_ 620 struct minherit_args { 621 void *addr; 622 size_t len; 623 int inherit; 624 }; 625 #endif 626 /* 627 * MPSAFE 628 */ 629 int 630 minherit(td, uap) 631 struct thread *td; 632 struct minherit_args *uap; 633 { 634 vm_offset_t addr; 635 vm_size_t size, pageoff; 636 vm_inherit_t inherit; 637 638 addr = (vm_offset_t)uap->addr; 639 size = uap->len; 640 inherit = uap->inherit; 641 642 pageoff = (addr & PAGE_MASK); 643 addr -= pageoff; 644 size += pageoff; 645 size = (vm_size_t) round_page(size); 646 if (addr + size < addr) 647 return (EINVAL); 648 649 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 650 addr + size, inherit)) { 651 case KERN_SUCCESS: 652 return (0); 653 case KERN_PROTECTION_FAILURE: 654 return (EACCES); 655 } 656 return (EINVAL); 657 } 658 659 #ifndef _SYS_SYSPROTO_H_ 660 struct madvise_args { 661 void *addr; 662 size_t len; 663 int behav; 664 }; 665 #endif 666 667 /* 668 * MPSAFE 669 */ 670 /* ARGSUSED */ 671 int 672 madvise(td, uap) 673 struct thread *td; 674 struct madvise_args *uap; 675 { 676 vm_offset_t start, end; 677 vm_map_t map; 678 struct proc *p; 679 int error; 680 681 /* 682 * Check for our special case, advising the swap pager we are 683 * "immortal." 684 */ 685 if (uap->behav == MADV_PROTECT) { 686 error = suser(td); 687 if (error == 0) { 688 p = td->td_proc; 689 PROC_LOCK(p); 690 p->p_flag |= P_PROTECTED; 691 PROC_UNLOCK(p); 692 } 693 return (error); 694 } 695 /* 696 * Check for illegal behavior 697 */ 698 if (uap->behav < 0 || uap->behav > MADV_CORE) 699 return (EINVAL); 700 /* 701 * Check for illegal addresses. Watch out for address wrap... Note 702 * that VM_*_ADDRESS are not constants due to casts (argh). 703 */ 704 map = &td->td_proc->p_vmspace->vm_map; 705 if ((vm_offset_t)uap->addr < vm_map_min(map) || 706 (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) 707 return (EINVAL); 708 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 709 return (EINVAL); 710 711 /* 712 * Since this routine is only advisory, we default to conservative 713 * behavior. 714 */ 715 start = trunc_page((vm_offset_t) uap->addr); 716 end = round_page((vm_offset_t) uap->addr + uap->len); 717 718 if (vm_map_madvise(map, start, end, uap->behav)) 719 return (EINVAL); 720 return (0); 721 } 722 723 #ifndef _SYS_SYSPROTO_H_ 724 struct mincore_args { 725 const void *addr; 726 size_t len; 727 char *vec; 728 }; 729 #endif 730 731 /* 732 * MPSAFE 733 */ 734 /* ARGSUSED */ 735 int 736 mincore(td, uap) 737 struct thread *td; 738 struct mincore_args *uap; 739 { 740 vm_offset_t addr, first_addr; 741 vm_offset_t end, cend; 742 pmap_t pmap; 743 vm_map_t map; 744 char *vec; 745 int error = 0; 746 int vecindex, lastvecindex; 747 vm_map_entry_t current; 748 vm_map_entry_t entry; 749 int mincoreinfo; 750 unsigned int timestamp; 751 752 /* 753 * Make sure that the addresses presented are valid for user 754 * mode. 755 */ 756 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 757 end = addr + (vm_size_t)round_page(uap->len); 758 map = &td->td_proc->p_vmspace->vm_map; 759 if (end > vm_map_max(map) || end < addr) 760 return (EINVAL); 761 762 /* 763 * Address of byte vector 764 */ 765 vec = uap->vec; 766 767 pmap = vmspace_pmap(td->td_proc->p_vmspace); 768 769 vm_map_lock_read(map); 770 RestartScan: 771 timestamp = map->timestamp; 772 773 if (!vm_map_lookup_entry(map, addr, &entry)) 774 entry = entry->next; 775 776 /* 777 * Do this on a map entry basis so that if the pages are not 778 * in the current processes address space, we can easily look 779 * up the pages elsewhere. 780 */ 781 lastvecindex = -1; 782 for (current = entry; 783 (current != &map->header) && (current->start < end); 784 current = current->next) { 785 786 /* 787 * ignore submaps (for now) or null objects 788 */ 789 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 790 current->object.vm_object == NULL) 791 continue; 792 793 /* 794 * limit this scan to the current map entry and the 795 * limits for the mincore call 796 */ 797 if (addr < current->start) 798 addr = current->start; 799 cend = current->end; 800 if (cend > end) 801 cend = end; 802 803 /* 804 * scan this entry one page at a time 805 */ 806 while (addr < cend) { 807 /* 808 * Check pmap first, it is likely faster, also 809 * it can provide info as to whether we are the 810 * one referencing or modifying the page. 811 */ 812 mincoreinfo = pmap_mincore(pmap, addr); 813 if (!mincoreinfo) { 814 vm_pindex_t pindex; 815 vm_ooffset_t offset; 816 vm_page_t m; 817 /* 818 * calculate the page index into the object 819 */ 820 offset = current->offset + (addr - current->start); 821 pindex = OFF_TO_IDX(offset); 822 VM_OBJECT_LOCK(current->object.vm_object); 823 m = vm_page_lookup(current->object.vm_object, 824 pindex); 825 /* 826 * if the page is resident, then gather information about 827 * it. 828 */ 829 if (m != NULL && m->valid != 0) { 830 mincoreinfo = MINCORE_INCORE; 831 vm_page_lock_queues(); 832 if (m->dirty || 833 pmap_is_modified(m)) 834 mincoreinfo |= MINCORE_MODIFIED_OTHER; 835 if ((m->flags & PG_REFERENCED) || 836 pmap_ts_referenced(m)) { 837 vm_page_flag_set(m, PG_REFERENCED); 838 mincoreinfo |= MINCORE_REFERENCED_OTHER; 839 } 840 vm_page_unlock_queues(); 841 } 842 VM_OBJECT_UNLOCK(current->object.vm_object); 843 } 844 845 /* 846 * subyte may page fault. In case it needs to modify 847 * the map, we release the lock. 848 */ 849 vm_map_unlock_read(map); 850 851 /* 852 * calculate index into user supplied byte vector 853 */ 854 vecindex = OFF_TO_IDX(addr - first_addr); 855 856 /* 857 * If we have skipped map entries, we need to make sure that 858 * the byte vector is zeroed for those skipped entries. 859 */ 860 while ((lastvecindex + 1) < vecindex) { 861 error = subyte(vec + lastvecindex, 0); 862 if (error) { 863 error = EFAULT; 864 goto done2; 865 } 866 ++lastvecindex; 867 } 868 869 /* 870 * Pass the page information to the user 871 */ 872 error = subyte(vec + vecindex, mincoreinfo); 873 if (error) { 874 error = EFAULT; 875 goto done2; 876 } 877 878 /* 879 * If the map has changed, due to the subyte, the previous 880 * output may be invalid. 881 */ 882 vm_map_lock_read(map); 883 if (timestamp != map->timestamp) 884 goto RestartScan; 885 886 lastvecindex = vecindex; 887 addr += PAGE_SIZE; 888 } 889 } 890 891 /* 892 * subyte may page fault. In case it needs to modify 893 * the map, we release the lock. 894 */ 895 vm_map_unlock_read(map); 896 897 /* 898 * Zero the last entries in the byte vector. 899 */ 900 vecindex = OFF_TO_IDX(end - first_addr); 901 while ((lastvecindex + 1) < vecindex) { 902 error = subyte(vec + lastvecindex, 0); 903 if (error) { 904 error = EFAULT; 905 goto done2; 906 } 907 ++lastvecindex; 908 } 909 910 /* 911 * If the map has changed, due to the subyte, the previous 912 * output may be invalid. 913 */ 914 vm_map_lock_read(map); 915 if (timestamp != map->timestamp) 916 goto RestartScan; 917 vm_map_unlock_read(map); 918 done2: 919 return (error); 920 } 921 922 #ifndef _SYS_SYSPROTO_H_ 923 struct mlock_args { 924 const void *addr; 925 size_t len; 926 }; 927 #endif 928 /* 929 * MPSAFE 930 */ 931 int 932 mlock(td, uap) 933 struct thread *td; 934 struct mlock_args *uap; 935 { 936 struct proc *proc; 937 vm_offset_t addr, end, last, start; 938 vm_size_t npages, size; 939 int error; 940 941 error = suser(td); 942 if (error) 943 return (error); 944 addr = (vm_offset_t)uap->addr; 945 size = uap->len; 946 last = addr + size; 947 start = trunc_page(addr); 948 end = round_page(last); 949 if (last < addr || end < addr) 950 return (EINVAL); 951 npages = atop(end - start); 952 if (npages > vm_page_max_wired) 953 return (ENOMEM); 954 proc = td->td_proc; 955 PROC_LOCK(proc); 956 if (ptoa(npages + 957 pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > 958 lim_cur(proc, RLIMIT_MEMLOCK)) { 959 PROC_UNLOCK(proc); 960 return (ENOMEM); 961 } 962 PROC_UNLOCK(proc); 963 if (npages + cnt.v_wire_count > vm_page_max_wired) 964 return (EAGAIN); 965 error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, 966 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 967 return (error == KERN_SUCCESS ? 0 : ENOMEM); 968 } 969 970 #ifndef _SYS_SYSPROTO_H_ 971 struct mlockall_args { 972 int how; 973 }; 974 #endif 975 976 /* 977 * MPSAFE 978 */ 979 int 980 mlockall(td, uap) 981 struct thread *td; 982 struct mlockall_args *uap; 983 { 984 vm_map_t map; 985 int error; 986 987 map = &td->td_proc->p_vmspace->vm_map; 988 error = 0; 989 990 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 991 return (EINVAL); 992 993 #if 0 994 /* 995 * If wiring all pages in the process would cause it to exceed 996 * a hard resource limit, return ENOMEM. 997 */ 998 PROC_LOCK(td->td_proc); 999 if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) > 1000 lim_cur(td->td_proc, RLIMIT_MEMLOCK))) { 1001 PROC_UNLOCK(td->td_proc); 1002 return (ENOMEM); 1003 } 1004 PROC_UNLOCK(td->td_proc); 1005 #else 1006 error = suser(td); 1007 if (error) 1008 return (error); 1009 #endif 1010 1011 if (uap->how & MCL_FUTURE) { 1012 vm_map_lock(map); 1013 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1014 vm_map_unlock(map); 1015 error = 0; 1016 } 1017 1018 if (uap->how & MCL_CURRENT) { 1019 /* 1020 * P1003.1-2001 mandates that all currently mapped pages 1021 * will be memory resident and locked (wired) upon return 1022 * from mlockall(). vm_map_wire() will wire pages, by 1023 * calling vm_fault_wire() for each page in the region. 1024 */ 1025 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1026 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1027 error = (error == KERN_SUCCESS ? 0 : EAGAIN); 1028 } 1029 1030 return (error); 1031 } 1032 1033 #ifndef _SYS_SYSPROTO_H_ 1034 struct munlockall_args { 1035 register_t dummy; 1036 }; 1037 #endif 1038 1039 /* 1040 * MPSAFE 1041 */ 1042 int 1043 munlockall(td, uap) 1044 struct thread *td; 1045 struct munlockall_args *uap; 1046 { 1047 vm_map_t map; 1048 int error; 1049 1050 map = &td->td_proc->p_vmspace->vm_map; 1051 error = suser(td); 1052 if (error) 1053 return (error); 1054 1055 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1056 vm_map_lock(map); 1057 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1058 vm_map_unlock(map); 1059 1060 /* Forcibly unwire all pages. */ 1061 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1062 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1063 1064 return (error); 1065 } 1066 1067 #ifndef _SYS_SYSPROTO_H_ 1068 struct munlock_args { 1069 const void *addr; 1070 size_t len; 1071 }; 1072 #endif 1073 /* 1074 * MPSAFE 1075 */ 1076 int 1077 munlock(td, uap) 1078 struct thread *td; 1079 struct munlock_args *uap; 1080 { 1081 vm_offset_t addr, end, last, start; 1082 vm_size_t size; 1083 int error; 1084 1085 error = suser(td); 1086 if (error) 1087 return (error); 1088 addr = (vm_offset_t)uap->addr; 1089 size = uap->len; 1090 last = addr + size; 1091 start = trunc_page(addr); 1092 end = round_page(last); 1093 if (last < addr || end < addr) 1094 return (EINVAL); 1095 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1096 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1097 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1098 } 1099 1100 /* 1101 * vm_mmap_vnode() 1102 * 1103 * MPSAFE 1104 * 1105 * Helper function for vm_mmap. Perform sanity check specific for mmap 1106 * operations on vnodes. 1107 */ 1108 int 1109 vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1110 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1111 struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp) 1112 { 1113 struct vattr va; 1114 void *handle; 1115 vm_object_t obj; 1116 struct mount *mp; 1117 int error, flags, type; 1118 int vfslocked; 1119 1120 mp = vp->v_mount; 1121 vfslocked = VFS_LOCK_GIANT(mp); 1122 if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) { 1123 VFS_UNLOCK_GIANT(vfslocked); 1124 return (error); 1125 } 1126 flags = *flagsp; 1127 obj = vp->v_object; 1128 if (vp->v_type == VREG) { 1129 /* 1130 * Get the proper underlying object 1131 */ 1132 if (obj == NULL) { 1133 error = EINVAL; 1134 goto done; 1135 } 1136 if (obj->handle != vp) { 1137 vput(vp); 1138 vp = (struct vnode*)obj->handle; 1139 vget(vp, LK_EXCLUSIVE, td); 1140 } 1141 type = OBJT_VNODE; 1142 handle = vp; 1143 } else if (vp->v_type == VCHR) { 1144 type = OBJT_DEVICE; 1145 handle = vp->v_rdev; 1146 1147 /* XXX: lack thredref on device */ 1148 if(vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) { 1149 *maxprotp = VM_PROT_ALL; 1150 *flagsp |= MAP_ANON; 1151 error = 0; 1152 goto done; 1153 } 1154 /* 1155 * cdevs does not provide private mappings of any kind. 1156 */ 1157 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1158 (prot & PROT_WRITE) != 0) { 1159 error = EACCES; 1160 goto done; 1161 } 1162 if (flags & (MAP_PRIVATE|MAP_COPY)) { 1163 error = EINVAL; 1164 goto done; 1165 } 1166 /* 1167 * Force device mappings to be shared. 1168 */ 1169 flags |= MAP_SHARED; 1170 } else { 1171 error = EINVAL; 1172 goto done; 1173 } 1174 if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) { 1175 goto done; 1176 } 1177 #ifdef MAC 1178 error = mac_check_vnode_mmap(td->td_ucred, vp, prot, flags); 1179 if (error != 0) 1180 goto done; 1181 #endif 1182 if ((flags & MAP_SHARED) != 0) { 1183 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1184 if (prot & PROT_WRITE) { 1185 error = EPERM; 1186 goto done; 1187 } 1188 *maxprotp &= ~VM_PROT_WRITE; 1189 } 1190 } 1191 /* 1192 * If it is a regular file without any references 1193 * we do not need to sync it. 1194 * Adjust object size to be the size of actual file. 1195 */ 1196 if (vp->v_type == VREG) { 1197 objsize = round_page(va.va_size); 1198 if (va.va_nlink == 0) 1199 flags |= MAP_NOSYNC; 1200 } 1201 obj = vm_pager_allocate(type, handle, objsize, prot, foff); 1202 if (obj == NULL) { 1203 error = (type == OBJT_DEVICE ? EINVAL : ENOMEM); 1204 goto done; 1205 } 1206 *objp = obj; 1207 *flagsp = flags; 1208 vfs_mark_atime(vp, td); 1209 1210 done: 1211 vput(vp); 1212 VFS_UNLOCK_GIANT(vfslocked); 1213 return (error); 1214 } 1215 1216 /* 1217 * vm_mmap_cdev() 1218 * 1219 * MPSAFE 1220 * 1221 * Helper function for vm_mmap. Perform sanity check specific for mmap 1222 * operations on cdevs. 1223 */ 1224 int 1225 vm_mmap_cdev(struct thread *td, vm_size_t objsize, 1226 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1227 struct cdev *cdev, vm_ooffset_t foff, vm_object_t *objp) 1228 { 1229 vm_object_t obj; 1230 int flags; 1231 1232 flags = *flagsp; 1233 1234 /* XXX: lack thredref on device */ 1235 if (cdev->si_devsw->d_flags & D_MMAP_ANON) { 1236 *maxprotp = VM_PROT_ALL; 1237 *flagsp |= MAP_ANON; 1238 return (0); 1239 } 1240 /* 1241 * cdevs does not provide private mappings of any kind. 1242 */ 1243 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1244 (prot & PROT_WRITE) != 0) 1245 return (EACCES); 1246 if (flags & (MAP_PRIVATE|MAP_COPY)) 1247 return (EINVAL); 1248 /* 1249 * Force device mappings to be shared. 1250 */ 1251 flags |= MAP_SHARED; 1252 #ifdef MAC_XXX 1253 error = mac_check_cdev_mmap(td->td_ucred, cdev, prot); 1254 if (error != 0) 1255 return (error); 1256 #endif 1257 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, foff); 1258 if (obj == NULL) 1259 return (EINVAL); 1260 *objp = obj; 1261 *flagsp = flags; 1262 return (0); 1263 } 1264 1265 /* 1266 * vm_mmap() 1267 * 1268 * MPSAFE 1269 * 1270 * Internal version of mmap. Currently used by mmap, exec, and sys5 1271 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1272 */ 1273 int 1274 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1275 vm_prot_t maxprot, int flags, 1276 objtype_t handle_type, void *handle, 1277 vm_ooffset_t foff) 1278 { 1279 boolean_t fitit; 1280 vm_object_t object; 1281 int rv = KERN_SUCCESS; 1282 int docow, error; 1283 struct thread *td = curthread; 1284 1285 if (size == 0) 1286 return (0); 1287 1288 size = round_page(size); 1289 1290 PROC_LOCK(td->td_proc); 1291 if (td->td_proc->p_vmspace->vm_map.size + size > 1292 lim_cur(td->td_proc, RLIMIT_VMEM)) { 1293 PROC_UNLOCK(td->td_proc); 1294 return(ENOMEM); 1295 } 1296 PROC_UNLOCK(td->td_proc); 1297 1298 /* 1299 * We currently can only deal with page aligned file offsets. 1300 * The check is here rather than in the syscall because the 1301 * kernel calls this function internally for other mmaping 1302 * operations (such as in exec) and non-aligned offsets will 1303 * cause pmap inconsistencies...so we want to be sure to 1304 * disallow this in all cases. 1305 */ 1306 if (foff & PAGE_MASK) 1307 return (EINVAL); 1308 1309 if ((flags & MAP_FIXED) == 0) { 1310 fitit = TRUE; 1311 *addr = round_page(*addr); 1312 } else { 1313 if (*addr != trunc_page(*addr)) 1314 return (EINVAL); 1315 fitit = FALSE; 1316 (void) vm_map_remove(map, *addr, *addr + size); 1317 } 1318 /* 1319 * Lookup/allocate object. 1320 */ 1321 switch (handle_type) { 1322 case OBJT_DEVICE: 1323 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, 1324 handle, foff, &object); 1325 break; 1326 case OBJT_VNODE: 1327 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1328 handle, foff, &object); 1329 break; 1330 case OBJT_DEFAULT: 1331 if (handle == NULL) { 1332 error = 0; 1333 break; 1334 } 1335 /* FALLTHROUGH */ 1336 default: 1337 error = EINVAL; 1338 } 1339 if (error) 1340 return (error); 1341 if (flags & MAP_ANON) { 1342 object = NULL; 1343 docow = 0; 1344 /* 1345 * Unnamed anonymous regions always start at 0. 1346 */ 1347 if (handle == 0) 1348 foff = 0; 1349 } else { 1350 docow = MAP_PREFAULT_PARTIAL; 1351 } 1352 1353 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1354 docow |= MAP_COPY_ON_WRITE; 1355 if (flags & MAP_NOSYNC) 1356 docow |= MAP_DISABLE_SYNCER; 1357 if (flags & MAP_NOCORE) 1358 docow |= MAP_DISABLE_COREDUMP; 1359 1360 #if defined(VM_PROT_READ_IS_EXEC) 1361 if (prot & VM_PROT_READ) 1362 prot |= VM_PROT_EXECUTE; 1363 1364 if (maxprot & VM_PROT_READ) 1365 maxprot |= VM_PROT_EXECUTE; 1366 #endif 1367 1368 if (fitit) 1369 *addr = pmap_addr_hint(object, *addr, size); 1370 1371 if (flags & MAP_STACK) 1372 rv = vm_map_stack(map, *addr, size, prot, maxprot, 1373 docow | MAP_STACK_GROWS_DOWN); 1374 else 1375 rv = vm_map_find(map, object, foff, addr, size, fitit, 1376 prot, maxprot, docow); 1377 1378 if (rv != KERN_SUCCESS) { 1379 /* 1380 * Lose the object reference. Will destroy the 1381 * object if it's an unnamed anonymous mapping 1382 * or named anonymous without other references. 1383 */ 1384 vm_object_deallocate(object); 1385 } else if (flags & MAP_SHARED) { 1386 /* 1387 * Shared memory is also shared with children. 1388 */ 1389 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); 1390 if (rv != KERN_SUCCESS) 1391 (void) vm_map_remove(map, *addr, *addr + size); 1392 } 1393 1394 /* 1395 * If the process has requested that all future mappings 1396 * be wired, then heed this. 1397 */ 1398 if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE)) 1399 vm_map_wire(map, *addr, *addr + size, 1400 VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); 1401 1402 switch (rv) { 1403 case KERN_SUCCESS: 1404 return (0); 1405 case KERN_INVALID_ADDRESS: 1406 case KERN_NO_SPACE: 1407 return (ENOMEM); 1408 case KERN_PROTECTION_FAILURE: 1409 return (EACCES); 1410 default: 1411 return (EINVAL); 1412 } 1413 } 1414