/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mac.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
        int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking, vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel from running out of
 * resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
        void *dummy;
{
        max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
        max_proc_mmap /= 100;
}
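
/*
 * For example (illustrative figures only): with a vm_kmem_size of 320 MB
 * and a vm_map_entry of roughly 100 bytes, the division above permits about
 * 3.3 million entries in total, so max_proc_mmap comes out near 33,000 per
 * process (further scaled by vm_refcnt in mmap() below).
 */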

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
        struct thread *td;
        struct sbrk_args *uap;
{
        /* Not yet implemented */
        /* mtx_lock(&Giant); */
        /* mtx_unlock(&Giant); */
        return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
        struct thread *td;
        struct sstk_args *uap;
{
        /* Not yet implemented */
        /* mtx_lock(&Giant); */
        /* mtx_unlock(&Giant); */
        return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
        int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
        struct thread *td;
        struct getpagesize_args *uap;
{
        /* MP SAFE */
        td->td_retval[0] = PAGE_SIZE;
        return (0);
}
#endif                          /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */
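
/*
 * For example (assuming PAGE_SIZE == 4096): a non-MAP_FIXED request with
 * len == 0x2000 and pos == 0x1003 is backed by the file starting at offset
 * 0x1000, covers three pages, and returns the chosen base address plus 3,
 * so the returned pointer addresses file offset 0x1003 as requested.
 */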

#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
        void *addr;
        size_t len;
        int prot;
        int flags;
        int fd;
        long pad;
        off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
        struct thread *td;
        struct mmap_args *uap;
{
        struct file *fp;
        struct vnode *vp;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot, maxprot;
        void *handle;
        int flags, error;
        off_t pos;
        struct vmspace *vms = td->td_proc->p_vmspace;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
        flags = uap->flags;
        pos = uap->pos;

        fp = NULL;
        /* Make sure the mapping fits into the numeric range, etc. */
        if ((ssize_t) uap->len < 0 ||
            ((flags & MAP_ANON) && uap->fd != -1))
                return (EINVAL);

        if (flags & MAP_STACK) {
                if ((uap->fd != -1) ||
                    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;                        /* low end... */
        size = (vm_size_t) round_page(size);    /* hi end */

        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);
                /* Address range must be all in user VM space. */
                if (addr < vm_map_min(&vms->vm_map) ||
                    addr + size > vm_map_max(&vms->vm_map))
                        return (EINVAL);
                if (addr + size < addr)
                        return (EINVAL);
        } else {
                /*
                 * XXX for non-fixed mappings where no hint is provided or
                 * the hint would fall in the potential heap space,
                 * place it after the end of the largest possible heap.
                 *
                 * There should really be a pmap call to determine a reasonable
                 * location.
                 */
                PROC_LOCK(td->td_proc);
                if (addr == 0 ||
                    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
                    addr < round_page((vm_offset_t)vms->vm_daddr +
                    lim_max(td->td_proc, RLIMIT_DATA))))
                        addr = round_page((vm_offset_t)vms->vm_daddr +
                            lim_max(td->td_proc, RLIMIT_DATA));
                PROC_UNLOCK(td->td_proc);
        }
        if (flags & MAP_ANON) {
                /*
                 * Mapping blank space is trivial.
                 */
                handle = NULL;
                maxprot = VM_PROT_ALL;
                pos = 0;
        } else {
                /*
                 * Mapping file, get fp for validation.  Obtain vnode and make
                 * sure it is of appropriate type.  Don't let the descriptor
                 * disappear on us if we block.
                 */
                if ((error = fget(td, uap->fd, &fp)) != 0)
                        goto done;
                if (fp->f_type != DTYPE_VNODE) {
                        error = EINVAL;
                        goto done;
                }
                /*
                 * POSIX shared-memory objects are defined to have
                 * kernel persistence, and are not defined to support
                 * read(2)/write(2) -- or even open(2).  Thus, we can
                 * use MAP_NOSYNC to trade on-disk coherence for speed.
                 * The shm_open(3) library routine turns on the FPOSIXSHM
                 * flag to request this behavior.
                 */
                if (fp->f_flag & FPOSIXSHM)
                        flags |= MAP_NOSYNC;
                vp = fp->f_vnode;
                /*
                 * Ensure that file and memory protections are
                 * compatible.  Note that we only worry about
                 * writability if mapping is shared; in this case,
                 * current and max prot are dictated by the open file.
                 * XXX use the vnode instead?  Problem is: what
                 * credentials do we use for determination?  What if
                 * proc does a setuid?
                 */
                if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
                        maxprot = VM_PROT_NONE;
                else
                        maxprot = VM_PROT_EXECUTE;
                if (fp->f_flag & FREAD) {
                        maxprot |= VM_PROT_READ;
                } else if (prot & PROT_READ) {
                        error = EACCES;
                        goto done;
                }
                /*
                 * If we are sharing potential changes (either via
                 * MAP_SHARED or via the implicit sharing of character
                 * device mappings), and we are trying to get write
                 * permission although we opened it without asking
                 * for it, bail out.
                 */
                if ((flags & MAP_SHARED) != 0) {
                        if ((fp->f_flag & FWRITE) != 0) {
                                maxprot |= VM_PROT_WRITE;
                        } else if ((prot & PROT_WRITE) != 0) {
                                error = EACCES;
                                goto done;
                        }
                } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
                        maxprot |= VM_PROT_WRITE;
                }
                handle = (void *)vp;
        }
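
        /*
         * For example, a regular file opened O_RDONLY may be mapped
         * MAP_SHARED only without PROT_WRITE (the check above returns
         * EACCES otherwise), while a MAP_PRIVATE mapping of the same
         * descriptor may still be writable because stores go to
         * anonymous copy-on-write pages and never reach the file.
         */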

        /*
         * Do not allow more than a certain number of vm_map_entry structures
         * per process.  Scale with the number of rforks sharing the map
         * to make the limit reasonable for threads.
         */
        if (max_proc_mmap &&
            vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
                error = ENOMEM;
                goto done;
        }

        error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
            flags, handle, pos);
        if (error == 0)
                td->td_retval[0] = (register_t) (addr + pageoff);
done:
        if (fp)
                fdrop(fp, td);

        return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
        caddr_t addr;
        int len;
        int prot;
        int flags;
        int fd;
        long pos;
};
#endif
int
ommap(td, uap)
        struct thread *td;
        struct ommap_args *uap;
{
        struct mmap_args nargs;
        static const char cvtbsdprot[8] = {
                0,
                PROT_EXEC,
                PROT_WRITE,
                PROT_EXEC | PROT_WRITE,
                PROT_READ,
                PROT_EXEC | PROT_READ,
                PROT_WRITE | PROT_READ,
                PROT_EXEC | PROT_WRITE | PROT_READ,
        };
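
        /*
         * The old 4.3BSD prot argument encodes execute, write and read in
         * bits 0, 1 and 2 of the value, which the table above translates
         * to the modern PROT_* flags; e.g. an old prot of 06 (write|read)
         * becomes PROT_WRITE | PROT_READ.
         */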

#define OMAP_ANON       0x0002
#define OMAP_COPY       0x0020
#define OMAP_SHARED     0x0010
#define OMAP_FIXED      0x0100

        nargs.addr = uap->addr;
        nargs.len = uap->len;
        nargs.prot = cvtbsdprot[uap->prot & 0x7];
        nargs.flags = 0;
        if (uap->flags & OMAP_ANON)
                nargs.flags |= MAP_ANON;
        if (uap->flags & OMAP_COPY)
                nargs.flags |= MAP_COPY;
        if (uap->flags & OMAP_SHARED)
                nargs.flags |= MAP_SHARED;
        else
                nargs.flags |= MAP_PRIVATE;
        if (uap->flags & OMAP_FIXED)
                nargs.flags |= MAP_FIXED;
        nargs.fd = uap->fd;
        nargs.pos = uap->pos;
        return (mmap(td, &nargs));
}
#endif                          /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
        void *addr;
        int len;
        int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
        struct thread *td;
        struct msync_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int flags;
        vm_map_t map;
        int rv;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        flags = uap->flags;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
                return (EINVAL);

        map = &td->td_proc->p_vmspace->vm_map;

        /*
         * Clean the pages and interpret the return value.
         */
        rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
            (flags & MS_INVALIDATE) != 0);
        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
                return (EINVAL);        /* Sun returns ENOMEM? */
        case KERN_INVALID_ARGUMENT:
                return (EBUSY);
        default:
                return (EINVAL);
        }
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
        void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
        struct thread *td;
        struct munmap_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        if (size == 0)
                return (EINVAL);

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap...
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
                return (EINVAL);
        vm_map_lock(map);
        /*
         * Make sure entire range is allocated.
         */
        if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
                vm_map_unlock(map);
                return (EINVAL);
        }
        /* returns nothing but KERN_SUCCESS anyway */
        vm_map_delete(map, addr, addr + size);
        vm_map_unlock(map);
        return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
        const void *addr;
        size_t len;
        int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
        struct thread *td;
        struct mprotect_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
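        /*
         * On platforms that define VM_PROT_READ_IS_EXEC a readable page is
         * implicitly executable, so widen the requested protection to match
         * what the hardware will grant anyway.
         */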
#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;
#endif

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, prot, FALSE)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
        void *addr;
        size_t len;
        int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
        struct thread *td;
        struct minherit_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_inherit_t inherit;

        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        inherit = uap->inherit;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, inherit)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
        void *addr;
        size_t len;
        int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
        struct thread *td;
        struct madvise_args *uap;
{
        vm_offset_t start, end;
        vm_map_t map;
        struct proc *p;
        int error;

        /*
         * Check for our special case, advising the swap pager we are
         * "immortal."
         */
        if (uap->behav == MADV_PROTECT) {
                error = suser(td);
                if (error == 0) {
                        p = td->td_proc;
                        PROC_LOCK(p);
                        p->p_flag |= P_PROTECTED;
                        PROC_UNLOCK(p);
                }
                return (error);
        }
        /*
         * Check for illegal behavior
         */
        if (uap->behav < 0 || uap->behav > MADV_CORE)
                return (EINVAL);
        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if ((vm_offset_t)uap->addr < vm_map_min(map) ||
            (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
                return (EINVAL);
        if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
                return (EINVAL);

        /*
         * Since this routine is only advisory, we default to conservative
         * behavior.
         */
        start = trunc_page((vm_offset_t) uap->addr);
        end = round_page((vm_offset_t) uap->addr + uap->len);

        if (vm_map_madvise(map, start, end, uap->behav))
                return (EINVAL);
        return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
        const void *addr;
        size_t len;
        char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
        struct thread *td;
        struct mincore_args *uap;
{
        vm_offset_t addr, first_addr;
        vm_offset_t end, cend;
        pmap_t pmap;
        vm_map_t map;
        char *vec;
        int error = 0;
        int vecindex, lastvecindex;
        vm_map_entry_t current;
        vm_map_entry_t entry;
        int mincoreinfo;
        unsigned int timestamp;

        /*
         * Make sure that the addresses presented are valid for user
         * mode.
         */
        first_addr = addr = trunc_page((vm_offset_t) uap->addr);
        end = addr + (vm_size_t)round_page(uap->len);
        map = &td->td_proc->p_vmspace->vm_map;
        if (end > vm_map_max(map) || end < addr)
                return (EINVAL);

        /*
         * Address of byte vector
         */
        vec = uap->vec;

        pmap = vmspace_pmap(td->td_proc->p_vmspace);
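
        /*
         * One status byte is reported for every page in [addr, end); the
         * vector is filled as the scan proceeds, and pages that fall in
         * unmapped gaps between map entries are reported as zero.
         */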
        vm_map_lock_read(map);
RestartScan:
        timestamp = map->timestamp;

        if (!vm_map_lookup_entry(map, addr, &entry))
                entry = entry->next;

        /*
         * Do this on a map entry basis so that if the pages are not
         * in the current process's address space, we can easily look
         * up the pages elsewhere.
         */
        lastvecindex = -1;
        for (current = entry;
            (current != &map->header) && (current->start < end);
            current = current->next) {

                /*
                 * ignore submaps (for now) or null objects
                 */
                if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
                    current->object.vm_object == NULL)
                        continue;

                /*
                 * limit this scan to the current map entry and the
                 * limits for the mincore call
                 */
                if (addr < current->start)
                        addr = current->start;
                cend = current->end;
                if (cend > end)
                        cend = end;

                /*
                 * scan this entry one page at a time
                 */
                while (addr < cend) {
                        /*
                         * Check the pmap first; it is likely faster, and it
                         * can also tell us whether we are the one referencing
                         * or modifying the page.
                         */
                        mtx_lock(&Giant);
                        mincoreinfo = pmap_mincore(pmap, addr);
                        mtx_unlock(&Giant);
                        if (!mincoreinfo) {
                                vm_pindex_t pindex;
                                vm_ooffset_t offset;
                                vm_page_t m;
                                /*
                                 * calculate the page index into the object
                                 */
                                offset = current->offset + (addr - current->start);
                                pindex = OFF_TO_IDX(offset);
                                VM_OBJECT_LOCK(current->object.vm_object);
                                m = vm_page_lookup(current->object.vm_object,
                                    pindex);
                                /*
                                 * if the page is resident, then gather
                                 * information about it.
                                 */
                                if (m != NULL && m->valid != 0) {
                                        mincoreinfo = MINCORE_INCORE;
                                        vm_page_lock_queues();
                                        if (m->dirty ||
                                            pmap_is_modified(m))
                                                mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                        if ((m->flags & PG_REFERENCED) ||
                                            pmap_ts_referenced(m)) {
                                                vm_page_flag_set(m, PG_REFERENCED);
                                                mincoreinfo |= MINCORE_REFERENCED_OTHER;
                                        }
                                        vm_page_unlock_queues();
                                }
                                VM_OBJECT_UNLOCK(current->object.vm_object);
                        }

                        /*
                         * subyte may page fault.  In case it needs to modify
                         * the map, we release the lock.
                         */
                        vm_map_unlock_read(map);

                        /*
                         * calculate index into user supplied byte vector
                         */
                        vecindex = OFF_TO_IDX(addr - first_addr);

                        /*
                         * If we have skipped map entries, we need to make sure
                         * that the byte vector is zeroed for those skipped
                         * entries.
                         */
                        while ((lastvecindex + 1) < vecindex) {
                                error = subyte(vec + lastvecindex, 0);
                                if (error) {
                                        error = EFAULT;
                                        goto done2;
                                }
                                ++lastvecindex;
                        }

                        /*
                         * Pass the page information to the user
                         */
                        error = subyte(vec + vecindex, mincoreinfo);
                        if (error) {
                                error = EFAULT;
                                goto done2;
                        }

                        /*
                         * If the map has changed, due to the subyte, the
                         * previous output may be invalid.
                         */
                        vm_map_lock_read(map);
                        if (timestamp != map->timestamp)
                                goto RestartScan;

                        lastvecindex = vecindex;
                        addr += PAGE_SIZE;
                }
        }

        /*
         * subyte may page fault.  In case it needs to modify
         * the map, we release the lock.
         */
        vm_map_unlock_read(map);

        /*
         * Zero the last entries in the byte vector.
         */
        vecindex = OFF_TO_IDX(end - first_addr);
        while ((lastvecindex + 1) < vecindex) {
                error = subyte(vec + lastvecindex, 0);
                if (error) {
                        error = EFAULT;
                        goto done2;
                }
                ++lastvecindex;
        }

        /*
         * If the map has changed, due to the subyte, the previous
         * output may be invalid.
         */
        vm_map_lock_read(map);
        if (timestamp != map->timestamp)
                goto RestartScan;
        vm_map_unlock_read(map);
done2:
        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
        struct thread *td;
        struct mlock_args *uap;
{
        struct proc *proc;
        vm_offset_t addr, end, last, start;
        vm_size_t npages, size;
        int error;

        error = suser(td);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
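
        /*
         * The request is charged against two limits before any pages are
         * wired: the system-wide vm_page_max_wired cap on wired pages and
         * the per-process RLIMIT_MEMLOCK resource limit.
         */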
        npages = atop(end - start);
        if (npages > vm_page_max_wired)
                return (ENOMEM);
        proc = td->td_proc;
        PROC_LOCK(proc);
        if (ptoa(npages +
            pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
            lim_cur(proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(proc);
        if (npages + cnt.v_wire_count > vm_page_max_wired)
                return (EAGAIN);
        error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
        int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
        struct thread *td;
        struct mlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = 0;

        if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
                return (EINVAL);

#if 0
        /*
         * If wiring all pages in the process would cause it to exceed
         * a hard resource limit, return ENOMEM.
         */
        PROC_LOCK(td->td_proc);
        if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
            lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(td->td_proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(td->td_proc);
#else
        error = suser(td);
        if (error)
                return (error);
#endif

        if (uap->how & MCL_FUTURE) {
                vm_map_lock(map);
                vm_map_modflags(map, MAP_WIREFUTURE, 0);
                vm_map_unlock(map);
                error = 0;
        }

        if (uap->how & MCL_CURRENT) {
                /*
                 * P1003.1-2001 mandates that all currently mapped pages
                 * will be memory resident and locked (wired) upon return
                 * from mlockall().  vm_map_wire() will wire pages, by
                 * calling vm_fault_wire() for each page in the region.
                 */
                error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
                error = (error == KERN_SUCCESS ? 0 : EAGAIN);
        }

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
        register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
        struct thread *td;
        struct munlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = suser(td);
        if (error)
                return (error);

        /* Clear the MAP_WIREFUTURE flag from this vm_map. */
        vm_map_lock(map);
        vm_map_modflags(map, 0, MAP_WIREFUTURE);
        vm_map_unlock(map);

        /* Forcibly unwire all pages. */
        error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
            VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
        struct thread *td;
        struct munlock_args *uap;
{
        vm_offset_t addr, end, last, start;
        vm_size_t size;
        int error;

        error = suser(td);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp)
{
        struct vattr va;
        void *handle;
        vm_object_t obj;
        int disablexworkaround, error, flags, type;

        mtx_lock(&Giant);
        if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
                mtx_unlock(&Giant);
                return (error);
        }
        flags = *flagsp;
        if (vp->v_type == VREG) {
                /*
                 * Get the proper underlying object
                 */
                if (VOP_GETVOBJECT(vp, &obj) != 0) {
                        error = EINVAL;
                        goto done;
                }
                if (obj->handle != vp) {
                        vput(vp);
                        vp = (struct vnode *)obj->handle;
                        vget(vp, LK_EXCLUSIVE, td);
                }
                type = OBJT_VNODE;
                handle = vp;
        } else if (vp->v_type == VCHR) {
                type = OBJT_DEVICE;
                handle = vp->v_rdev;

                if (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) {
                        *maxprotp = VM_PROT_ALL;
                        *flagsp |= MAP_ANON;
                        error = 0;
                        goto done;
                }
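                /*
                 * A cdev that sets D_MMAP_ANON in its cdevsw (such as
                 * /dev/zero) is thus mapped exactly like anonymous memory:
                 * full protection and no device-backed object at all.
                 */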
                /*
                 * cdevs do not provide private mappings of any kind.
                 */
                if ((*maxprotp & VM_PROT_WRITE) == 0 &&
                    (prot & PROT_WRITE) != 0) {
                        error = EACCES;
                        goto done;
                }
                /*
                 * However, for the XIG X server to continue to work,
                 * we should allow the superuser to do it anyway.
                 * We only allow it at securelevel < 1.
                 * (Because the XIG X server writes directly to video
                 * memory via /dev/mem, it should never work at any
                 * other securelevel.)
                 * XXX this will have to go
                 */
                if (securelevel_ge(td->td_ucred, 1))
                        disablexworkaround = 1;
                else
                        disablexworkaround = suser(td);
                if (disablexworkaround && (flags & (MAP_PRIVATE|MAP_COPY))) {
                        error = EINVAL;
                        goto done;
                }
                /*
                 * Force device mappings to be shared.
                 */
                flags &= ~(MAP_PRIVATE|MAP_COPY);
                flags |= MAP_SHARED;
        } else {
                error = EINVAL;
                goto done;
        }
        if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) {
                goto done;
        }
        if ((flags & MAP_SHARED) != 0) {
                if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
                        if (prot & PROT_WRITE) {
                                error = EPERM;
                                goto done;
                        }
                        *maxprotp &= ~VM_PROT_WRITE;
                }
#ifdef MAC
                error = mac_check_vnode_mmap(td->td_ucred, vp, prot);
                if (error != 0)
                        goto done;
#endif
        }
        /*
         * If it is a regular file without any references
         * we do not need to sync it.
         * Adjust the object size to be the size of the actual file.
         */
        if (vp->v_type == VREG) {
                objsize = round_page(va.va_size);
                if (va.va_nlink == 0)
                        flags |= MAP_NOSYNC;
        }
        obj = vm_pager_allocate(type, handle, objsize, prot, foff);
        if (obj == NULL) {
                error = (type == OBJT_DEVICE ? EINVAL : ENOMEM);
                goto done;
        }
        *objp = obj;
        *flagsp = flags;
done:
        vput(vp);
        mtx_unlock(&Giant);
        return (error);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
        vm_prot_t maxprot, int flags,
        void *handle,
        vm_ooffset_t foff)
{
        boolean_t fitit;
        vm_object_t object;
        int rv = KERN_SUCCESS;
        vm_ooffset_t objsize;
        int docow, error;
        struct thread *td = curthread;

        if (size == 0)
                return (0);

        objsize = size = round_page(size);

        PROC_LOCK(td->td_proc);
        if (td->td_proc->p_vmspace->vm_map.size + size >
            lim_cur(td->td_proc, RLIMIT_VMEM)) {
                PROC_UNLOCK(td->td_proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(td->td_proc);

        /*
         * We currently can only deal with page aligned file offsets.
         * The check is here rather than in the syscall because the
         * kernel calls this function internally for other mmapping
         * operations (such as in exec) and non-aligned offsets will
         * cause pmap inconsistencies...so we want to be sure to
         * disallow this in all cases.
         */
        if (foff & PAGE_MASK)
                return (EINVAL);

        if ((flags & MAP_FIXED) == 0) {
                fitit = TRUE;
                *addr = round_page(*addr);
        } else {
                if (*addr != trunc_page(*addr))
                        return (EINVAL);
                fitit = FALSE;
                (void) vm_map_remove(map, *addr, *addr + size);
        }
        /*
         * Lookup/allocate object.
         */
        if (handle != NULL) {
                error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
                    handle, foff, &object);
                if (error) {
                        return (error);
                }
        }
        if (flags & MAP_ANON) {
                object = NULL;
                docow = 0;
                /*
                 * Unnamed anonymous regions always start at 0.
                 */
                if (handle == 0)
                        foff = 0;
        } else {
                docow = MAP_PREFAULT_PARTIAL;
        }
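
        /*
         * A mapping that is neither anonymous nor MAP_SHARED is a private
         * file mapping, so it is made copy-on-write below: the first store
         * to a page replaces it with an anonymous copy and the underlying
         * file is never modified.
         */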
        if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
                docow |= MAP_COPY_ON_WRITE;
        if (flags & MAP_NOSYNC)
                docow |= MAP_DISABLE_SYNCER;
        if (flags & MAP_NOCORE)
                docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;

        if (maxprot & VM_PROT_READ)
                maxprot |= VM_PROT_EXECUTE;
#endif

        if (fitit)
                *addr = pmap_addr_hint(object, *addr, size);

        if (flags & MAP_STACK)
                rv = vm_map_stack(map, *addr, size, prot, maxprot,
                    docow | MAP_STACK_GROWS_DOWN);
        else
                rv = vm_map_find(map, object, foff, addr, size, fitit,
                    prot, maxprot, docow);

        if (rv != KERN_SUCCESS) {
                /*
                 * Lose the object reference.  Will destroy the
                 * object if it's an unnamed anonymous mapping
                 * or named anonymous without other references.
                 */
                vm_object_deallocate(object);
        } else if (flags & MAP_SHARED) {
                /*
                 * Shared memory is also shared with children.
                 */
                rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
                if (rv != KERN_SUCCESS)
                        (void) vm_map_remove(map, *addr, *addr + size);
        }

        /*
         * If the process has requested that all future mappings
         * be wired, then heed this.
         */
        if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
                vm_map_wire(map, *addr, *addr + size,
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
        case KERN_NO_SPACE:
                return (ENOMEM);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        default:
                return (EINVAL);
        }
}