/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mac.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.
 * We want a default that is good enough to prevent the kernel running out
 * of resources if attacked from a compromised user account, but generous
 * enough such that multi-threaded processes are not unduly inconvenienced.
 */
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
	void *dummy;
{
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	/* mtx_lock(&Giant); */
	/* mtx_unlock(&Giant); */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	/* mtx_lock(&Giant); */
	/* mtx_unlock(&Giant); */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
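 *
 * For example (an illustrative note, assuming 4KB pages): a call such as
 *
 *	p = mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 0x1234);
 *
 * maps the file page beginning at offset 0x1000 and returns the chosen
 * base address plus 0x234, so p points at the byte for file offset 0x1234.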
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;
	/* make sure mapping fits into numeric range etc */
	if ((ssize_t) uap->len < 0 ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.
		 * don't let the descriptor disappear on us if we block
		 */
		if ((error = fget(td, uap->fd, &fp)) != 0)
			goto done;
		if (fp->f_type != DTYPE_VNODE) {
			error = EINVAL;
			goto done;
		}
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.
		 * Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
	}

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos);
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(td, &nargs));
}
#endif				/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	int len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_delete(map, addr, addr + size);
	vm_map_unlock(map);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
	struct thread *td;
	struct madvise_args
	    *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = suser(td);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
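			 * A pmap_mincore() return value of 0 means the
			 * pmap has no information for this address, in
			 * which case we fall back to looking the page up
			 * in the backing VM object below.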
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;
				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				VM_OBJECT_LOCK(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather
				 * information about it.
				 */
				if (m != NULL && m->valid != 0) {
					mincoreinfo = MINCORE_INCORE;
					vm_page_lock_queues();
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
					vm_page_unlock_queues();
				}
				VM_OBJECT_UNLOCK(current->object.vm_object);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	struct proc *proc;
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	int error;

	error = suser(td);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	proc = td->td_proc;
	PROC_LOCK(proc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
	    lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = 0;

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

#if 0
	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	PROC_LOCK(td->td_proc);
	if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) >
	    lim_cur(td->td_proc, RLIMIT_MEMLOCK))) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);
#else
	error = suser(td);
	if (error)
		return (error);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = suser(td);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/*
	 * Forcibly unwire all pages.
	 */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	error = suser(td);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp)
{
	struct vattr va;
	void *handle;
	vm_object_t obj;
	int error, flags, type;

	mtx_lock(&Giant);
	if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
		mtx_unlock(&Giant);
		return (error);
	}
	flags = *flagsp;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (VOP_GETVOBJECT(vp, &obj) != 0) {
			error = EINVAL;
			goto done;
		}
		if (obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			vget(vp, LK_EXCLUSIVE, td);
		}
		type = OBJT_VNODE;
		handle = vp;
	} else if (vp->v_type == VCHR) {
		type = OBJT_DEVICE;
		handle = vp->v_rdev;

		if (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) {
			*maxprotp = VM_PROT_ALL;
			*flagsp |= MAP_ANON;
			error = 0;
			goto done;
		}
		/*
		 * cdevs do not provide private mappings of any kind.
		 */
		if ((*maxprotp & VM_PROT_WRITE) == 0 &&
		    (prot & PROT_WRITE) != 0) {
			error = EACCES;
			goto done;
		}
		if (flags & (MAP_PRIVATE|MAP_COPY)) {
			error = EINVAL;
			goto done;
		}
		/*
		 * Force device mappings to be shared.
		 */
		flags &= ~(MAP_PRIVATE|MAP_COPY);
		flags |= MAP_SHARED;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) {
		goto done;
	}
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
#ifdef MAC
		error = mac_check_vnode_mmap(td->td_ucred, vp, prot);
		if (error != 0)
			goto done;
#endif
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	if (vp->v_type == VREG) {
		objsize = round_page(va.va_size);
		if (va.va_nlink == 0)
			flags |= MAP_NOSYNC;
	}
	obj = vm_pager_allocate(type, handle, objsize, prot, foff);
	if (obj == NULL) {
		error = (type == OBJT_DEVICE ?
		    EINVAL : ENOMEM);
		goto done;
	}
	*objp = obj;
	*flagsp = flags;
done:
	vput(vp);
	mtx_unlock(&Giant);
	return (error);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	int rv = KERN_SUCCESS;
	vm_ooffset_t objsize;
	int docow, error;
	struct thread *td = curthread;

	if (size == 0)
		return (0);

	objsize = size = round_page(size);

	PROC_LOCK(td->td_proc);
	if (td->td_proc->p_vmspace->vm_map.size + size >
	    lim_cur(td->td_proc, RLIMIT_VMEM)) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
		(void) vm_map_remove(map, *addr, *addr + size);
	}
	/*
	 * Lookup/allocate object.
	 */
	if (handle != NULL) {
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		if (error) {
			return (error);
		}
	}
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		docow = MAP_PREFAULT_PARTIAL;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	if (fitit)
		*addr = pmap_addr_hint(object, *addr, size);

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else
		rv = vm_map_find(map, object, foff, addr, size, fitit,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	} else if (flags & MAP_SHARED) {
		/*
		 * Shared memory is also shared with children.
		 */
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS)
			(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * If the process has requested that all future mappings
	 * be wired, then heed this.
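	 * (MAP_WIREFUTURE is set on the map by mlockall(MCL_FUTURE) above.)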
	 */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_wire(map, *addr, *addr + size,
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
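
/*
 * Illustrative sketch (hypothetical caller and variable names): an in-kernel
 * user of vm_mmap(), such as the SysV shared memory code, can establish an
 * anonymous, shared mapping in a process's map along these lines:
 *
 *	vm_offset_t attach_addr = 0;
 *	int error;
 *
 *	error = vm_mmap(&p->p_vmspace->vm_map, &attach_addr,
 *	    round_page(segsize), VM_PROT_ALL, VM_PROT_ALL,
 *	    MAP_ANON | MAP_SHARED, NULL, 0);
 *	if (error != 0)
 *		return (error);
 */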