/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mac.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel running out of resources
 * if attacked from a compromised user account but generous enough such that
 * multi-threaded processes are not unduly inconvenienced.
 */
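/*
 * For illustration only (the numbers below are assumptions, not
 * measurements): with the 1/100 scaling in vmmapentry_rsrc_init(), a
 * machine whose vm_kmem_size is 200 MB and whose struct vm_map_entry is
 * taken to be roughly 100 bytes would end up with a max_proc_mmap of
 * about 20000 entries per process.  The mmap() syscall further scales
 * this limit by vm_refcnt, so rfork()-shared address spaces get a
 * proportionally larger allowance.
 */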
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
	void *dummy;
{
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t, vm_object_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */
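/*
 * For illustration, assuming 4 KB pages: a call such as
 * mmap(NULL, len, prot, flags, fd, 0x1234) maps the file starting at the
 * page boundary 0x1000 and returns base + 0x234, so the returned pointer
 * still corresponds to file offset 0x1234; see the pageoff handling in
 * mmap() below.
 */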
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int flags, error;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;
	/* make sure mapping fits into numeric range etc */
	if ((ssize_t) uap->len < 0 ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.  Don't let the descriptor
		 * disappear on us if we block.
		 */
		if ((error = fget(td, uap->fd, &fp)) != 0)
			goto done;
		if (fp->f_type != DTYPE_VNODE) {
			error = EINVAL;
			goto done;
		}
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	int len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_delete(map, addr, addr + size);
	vm_map_unlock(map);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = suser(td);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;
				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				VM_OBJECT_LOCK(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather
				 * information about it.
				 */
				if (m != NULL && m->valid != 0) {
					mincoreinfo = MINCORE_INCORE;
					vm_page_lock_queues();
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
					vm_page_unlock_queues();
				}
				VM_OBJECT_UNLOCK(current->object.vm_object);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	struct proc *proc;
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	int error;

	error = suser(td);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	proc = td->td_proc;
	PROC_LOCK(proc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
	    lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = 0;

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

#if 0
	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	PROC_LOCK(td->td_proc);
	if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) >
	    lim_cur(td->td_proc, RLIMIT_MEMLOCK))) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);
#else
	error = suser(td);
	if (error)
		return (error);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = suser(td);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	error = suser(td);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp)
{
	struct vattr va;
	void *handle;
	vm_object_t obj;
	struct mount *mp;
	int error, flags, type;
	int vfslocked;

	mp = vp->v_mount;
	vfslocked = VFS_LOCK_GIANT(mp);
	if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			vget(vp, LK_EXCLUSIVE, td);
		}
		type = OBJT_VNODE;
		handle = vp;
	} else if (vp->v_type == VCHR) {
		type = OBJT_DEVICE;
		handle = vp->v_rdev;

		/* XXX: lack thread ref on device */
		if (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) {
			*maxprotp = VM_PROT_ALL;
			*flagsp |= MAP_ANON;
			error = 0;
			goto done;
		}
		/*
		 * cdevs do not provide private mappings of any kind.
		 */
		if ((*maxprotp & VM_PROT_WRITE) == 0 &&
		    (prot & PROT_WRITE) != 0) {
			error = EACCES;
			goto done;
		}
		if (flags & (MAP_PRIVATE|MAP_COPY)) {
			error = EINVAL;
			goto done;
		}
		/*
		 * Force device mappings to be shared.
		 */
		flags |= MAP_SHARED;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) {
		goto done;
	}
#ifdef MAC
	error = mac_check_vnode_mmap(td->td_ucred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	if (vp->v_type == VREG) {
		objsize = round_page(va.va_size);
		if (va.va_nlink == 0)
			flags |= MAP_NOSYNC;
	}
	obj = vm_pager_allocate(type, handle, objsize, prot, foff);
	if (obj == NULL) {
		error = (type == OBJT_DEVICE ? EINVAL : ENOMEM);
		goto done;
	}
	*objp = obj;
	*flagsp = flags;
	vfs_mark_atime(vp, td);

done:
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t foff, vm_object_t *objp)
{
	vm_object_t obj;
	int flags;

	flags = *flagsp;

	/* XXX: lack thread ref on device */
	if (cdev->si_devsw->d_flags & D_MMAP_ANON) {
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_check_cdev_mmap(td->td_ucred, cdev, prot);
	if (error != 0)
		return (error);
#endif
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, foff);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	int rv = KERN_SUCCESS;
	int docow, error;
	struct thread *td = curthread;

	if (size == 0)
		return (0);

	size = round_page(size);

	PROC_LOCK(td->td_proc);
	if (td->td_proc->p_vmspace->vm_map.size + size >
	    lim_cur(td->td_proc, RLIMIT_VMEM)) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
		(void) vm_map_remove(map, *addr, *addr + size);
	}
	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		docow = MAP_PREFAULT_PARTIAL;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	if (fitit)
		*addr = pmap_addr_hint(object, *addr, size);

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else
		rv = vm_map_find(map, object, foff, addr, size, fitit,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	} else if (flags & MAP_SHARED) {
		/*
		 * Shared memory is also shared with children.
		 */
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS)
			(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * If the process has requested that all future mappings
	 * be wired, then heed this.
	 */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_wire(map, *addr, *addr + size,
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}