1 /* 2 * Copyright (c) 1988 University of Utah. 3 * Copyright (c) 1991, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the Systems Programming Group of the University of Utah Computer 8 * Science Department. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 39 * 40 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 41 */ 42 43 /* 44 * Mapped file (mmap) interface to VM 45 */ 46 47 #include <sys/cdefs.h> 48 __FBSDID("$FreeBSD$"); 49 50 #include "opt_compat.h" 51 #include "opt_mac.h" 52 53 #include <sys/param.h> 54 #include <sys/systm.h> 55 #include <sys/kernel.h> 56 #include <sys/lock.h> 57 #include <sys/mutex.h> 58 #include <sys/sysproto.h> 59 #include <sys/filedesc.h> 60 #include <sys/proc.h> 61 #include <sys/resource.h> 62 #include <sys/resourcevar.h> 63 #include <sys/vnode.h> 64 #include <sys/fcntl.h> 65 #include <sys/file.h> 66 #include <sys/mac.h> 67 #include <sys/mman.h> 68 #include <sys/conf.h> 69 #include <sys/stat.h> 70 #include <sys/vmmeter.h> 71 #include <sys/sysctl.h> 72 73 #include <vm/vm.h> 74 #include <vm/vm_param.h> 75 #include <vm/pmap.h> 76 #include <vm/vm_map.h> 77 #include <vm/vm_object.h> 78 #include <vm/vm_page.h> 79 #include <vm/vm_pager.h> 80 #include <vm/vm_pageout.h> 81 #include <vm/vm_extern.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_kern.h> 84 85 #ifndef _SYS_SYSPROTO_H_ 86 struct sbrk_args { 87 int incr; 88 }; 89 #endif 90 91 static int max_proc_mmap; 92 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, ""); 93 94 /* 95 * Set the maximum number of vm_map_entry structures per process. Roughly 96 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 97 * of our KVM malloc space still results in generous limits. We want a 98 * default that is good enough to prevent the kernel running out of resources 99 * if attacked from compromised user account but generous enough such that 100 * multi-threaded processes are not unduly inconvenienced. 101 */ 102 static void vmmapentry_rsrc_init(void *); 103 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL) 104 105 static void 106 vmmapentry_rsrc_init(dummy) 107 void *dummy; 108 { 109 max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); 110 max_proc_mmap /= 100; 111 } 112 113 /* 114 * MPSAFE 115 */ 116 /* ARGSUSED */ 117 int 118 sbrk(td, uap) 119 struct thread *td; 120 struct sbrk_args *uap; 121 { 122 /* Not yet implemented */ 123 /* mtx_lock(&Giant); */ 124 /* mtx_unlock(&Giant); */ 125 return (EOPNOTSUPP); 126 } 127 128 #ifndef _SYS_SYSPROTO_H_ 129 struct sstk_args { 130 int incr; 131 }; 132 #endif 133 134 /* 135 * MPSAFE 136 */ 137 /* ARGSUSED */ 138 int 139 sstk(td, uap) 140 struct thread *td; 141 struct sstk_args *uap; 142 { 143 /* Not yet implemented */ 144 /* mtx_lock(&Giant); */ 145 /* mtx_unlock(&Giant); */ 146 return (EOPNOTSUPP); 147 } 148 149 #if defined(COMPAT_43) || defined(COMPAT_SUNOS) 150 #ifndef _SYS_SYSPROTO_H_ 151 struct getpagesize_args { 152 int dummy; 153 }; 154 #endif 155 156 /* ARGSUSED */ 157 int 158 ogetpagesize(td, uap) 159 struct thread *td; 160 struct getpagesize_args *uap; 161 { 162 /* MP SAFE */ 163 td->td_retval[0] = PAGE_SIZE; 164 return (0); 165 } 166 #endif /* COMPAT_43 || COMPAT_SUNOS */ 167 168 169 /* 170 * Memory Map (mmap) system call. Note that the file offset 171 * and address are allowed to be NOT page aligned, though if 172 * the MAP_FIXED flag it set, both must have the same remainder 173 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 174 * page-aligned, the actual mapping starts at trunc_page(addr) 175 * and the return value is adjusted up by the page offset. 176 * 177 * Generally speaking, only character devices which are themselves 178 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 179 * there would be no cache coherency between a descriptor and a VM mapping 180 * both to the same character device. 181 * 182 * Block devices can be mmap'd no matter what they represent. Cache coherency 183 * is maintained as long as you do not write directly to the underlying 184 * character device. 185 */ 186 #ifndef _SYS_SYSPROTO_H_ 187 struct mmap_args { 188 void *addr; 189 size_t len; 190 int prot; 191 int flags; 192 int fd; 193 long pad; 194 off_t pos; 195 }; 196 #endif 197 198 /* 199 * MPSAFE 200 */ 201 int 202 mmap(td, uap) 203 struct thread *td; 204 struct mmap_args *uap; 205 { 206 struct file *fp = NULL; 207 struct vnode *vp; 208 vm_offset_t addr; 209 vm_size_t size, pageoff; 210 vm_prot_t prot, maxprot; 211 void *handle; 212 int flags, error; 213 int disablexworkaround; 214 off_t pos; 215 struct vmspace *vms = td->td_proc->p_vmspace; 216 vm_object_t obj; 217 218 addr = (vm_offset_t) uap->addr; 219 size = uap->len; 220 prot = uap->prot & VM_PROT_ALL; 221 flags = uap->flags; 222 pos = uap->pos; 223 224 vp = NULL; 225 fp = NULL; 226 /* make sure mapping fits into numeric range etc */ 227 if ((ssize_t) uap->len < 0 || 228 ((flags & MAP_ANON) && uap->fd != -1)) 229 return (EINVAL); 230 231 if (flags & MAP_STACK) { 232 if ((uap->fd != -1) || 233 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 234 return (EINVAL); 235 flags |= MAP_ANON; 236 pos = 0; 237 } 238 239 /* 240 * Align the file position to a page boundary, 241 * and save its page offset component. 242 */ 243 pageoff = (pos & PAGE_MASK); 244 pos -= pageoff; 245 246 /* Adjust size for rounding (on both ends). */ 247 size += pageoff; /* low end... */ 248 size = (vm_size_t) round_page(size); /* hi end */ 249 250 /* 251 * Check for illegal addresses. Watch out for address wrap... Note 252 * that VM_*_ADDRESS are not constants due to casts (argh). 253 */ 254 if (flags & MAP_FIXED) { 255 /* 256 * The specified address must have the same remainder 257 * as the file offset taken modulo PAGE_SIZE, so it 258 * should be aligned after adjustment by pageoff. 259 */ 260 addr -= pageoff; 261 if (addr & PAGE_MASK) 262 return (EINVAL); 263 /* Address range must be all in user VM space. */ 264 if (addr < vm_map_min(&vms->vm_map) || 265 addr + size > vm_map_max(&vms->vm_map)) 266 return (EINVAL); 267 if (addr + size < addr) 268 return (EINVAL); 269 } 270 /* 271 * XXX for non-fixed mappings where no hint is provided or 272 * the hint would fall in the potential heap space, 273 * place it after the end of the largest possible heap. 274 * 275 * There should really be a pmap call to determine a reasonable 276 * location. 277 */ 278 else if (addr == 0 || 279 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 280 addr < round_page((vm_offset_t)vms->vm_daddr + 281 td->td_proc->p_rlimit[RLIMIT_DATA].rlim_max))) 282 addr = round_page((vm_offset_t)vms->vm_daddr + 283 td->td_proc->p_rlimit[RLIMIT_DATA].rlim_max); 284 285 mtx_lock(&Giant); /* syscall marked mp-safe but isn't */ 286 do { 287 if (flags & MAP_ANON) { 288 /* 289 * Mapping blank space is trivial. 290 */ 291 handle = NULL; 292 maxprot = VM_PROT_ALL; 293 pos = 0; 294 break; 295 } 296 /* 297 * Mapping file, get fp for validation. Obtain vnode and make 298 * sure it is of appropriate type. 299 * don't let the descriptor disappear on us if we block 300 */ 301 if ((error = fget(td, uap->fd, &fp)) != 0) 302 goto done; 303 if (fp->f_type != DTYPE_VNODE) { 304 error = EINVAL; 305 goto done; 306 } 307 308 /* 309 * POSIX shared-memory objects are defined to have 310 * kernel persistence, and are not defined to support 311 * read(2)/write(2) -- or even open(2). Thus, we can 312 * use MAP_ASYNC to trade on-disk coherence for speed. 313 * The shm_open(3) library routine turns on the FPOSIXSHM 314 * flag to request this behavior. 315 */ 316 if (fp->f_flag & FPOSIXSHM) 317 flags |= MAP_NOSYNC; 318 vp = fp->f_vnode; 319 error = vget(vp, LK_EXCLUSIVE, td); 320 if (error) 321 goto done; 322 if (vp->v_type != VREG && vp->v_type != VCHR) { 323 error = EINVAL; 324 goto done; 325 } 326 if (vp->v_type == VREG) { 327 /* 328 * Get the proper underlying object 329 */ 330 if (VOP_GETVOBJECT(vp, &obj) != 0) { 331 error = EINVAL; 332 goto done; 333 } 334 if (obj->handle != vp) { 335 vput(vp); 336 vp = (struct vnode*)obj->handle; 337 vget(vp, LK_EXCLUSIVE, td); 338 } 339 } 340 /* 341 * XXX hack to handle use of /dev/zero to map anon memory (ala 342 * SunOS). 343 */ 344 if ((vp->v_type == VCHR) && 345 (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) { 346 handle = NULL; 347 maxprot = VM_PROT_ALL; 348 flags |= MAP_ANON; 349 pos = 0; 350 break; 351 } 352 /* 353 * cdevs does not provide private mappings of any kind. 354 */ 355 /* 356 * However, for XIG X server to continue to work, 357 * we should allow the superuser to do it anyway. 358 * We only allow it at securelevel < 1. 359 * (Because the XIG X server writes directly to video 360 * memory via /dev/mem, it should never work at any 361 * other securelevel. 362 * XXX this will have to go 363 */ 364 if (securelevel_ge(td->td_ucred, 1)) 365 disablexworkaround = 1; 366 else 367 disablexworkaround = suser(td); 368 if (vp->v_type == VCHR && disablexworkaround && 369 (flags & (MAP_PRIVATE|MAP_COPY))) { 370 error = EINVAL; 371 goto done; 372 } 373 /* 374 * Ensure that file and memory protections are 375 * compatible. Note that we only worry about 376 * writability if mapping is shared; in this case, 377 * current and max prot are dictated by the open file. 378 * XXX use the vnode instead? Problem is: what 379 * credentials do we use for determination? What if 380 * proc does a setuid? 381 */ 382 maxprot = VM_PROT_EXECUTE; /* ??? */ 383 if (fp->f_flag & FREAD) { 384 maxprot |= VM_PROT_READ; 385 } else if (prot & PROT_READ) { 386 error = EACCES; 387 goto done; 388 } 389 /* 390 * If we are sharing potential changes (either via 391 * MAP_SHARED or via the implicit sharing of character 392 * device mappings), and we are trying to get write 393 * permission although we opened it without asking 394 * for it, bail out. Check for superuser, only if 395 * we're at securelevel < 1, to allow the XIG X server 396 * to continue to work. 397 */ 398 if ((flags & MAP_SHARED) != 0 || 399 (vp->v_type == VCHR && disablexworkaround)) { 400 if ((fp->f_flag & FWRITE) != 0) { 401 struct vattr va; 402 if ((error = 403 VOP_GETATTR(vp, &va, 404 td->td_ucred, td))) { 405 goto done; 406 } 407 if ((va.va_flags & 408 (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) { 409 maxprot |= VM_PROT_WRITE; 410 } else if (prot & PROT_WRITE) { 411 error = EPERM; 412 goto done; 413 } 414 } else if ((prot & PROT_WRITE) != 0) { 415 error = EACCES; 416 goto done; 417 } 418 } else { 419 maxprot |= VM_PROT_WRITE; 420 } 421 422 handle = (void *)vp; 423 } while (0); 424 425 /* 426 * Do not allow more then a certain number of vm_map_entry structures 427 * per process. Scale with the number of rforks sharing the map 428 * to make the limit reasonable for threads. 429 */ 430 if (max_proc_mmap && 431 vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { 432 error = ENOMEM; 433 goto done; 434 } 435 436 mtx_unlock(&Giant); 437 error = 0; 438 #ifdef MAC 439 if (handle != NULL && (flags & MAP_SHARED) != 0) { 440 error = mac_check_vnode_mmap(td->td_ucred, 441 (struct vnode *)handle, prot); 442 } 443 #endif 444 if (error == 0) 445 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 446 flags, handle, pos); 447 mtx_lock(&Giant); 448 if (error == 0) 449 td->td_retval[0] = (register_t) (addr + pageoff); 450 done: 451 if (vp) 452 vput(vp); 453 mtx_unlock(&Giant); 454 if (fp) 455 fdrop(fp, td); 456 457 return (error); 458 } 459 460 #ifdef COMPAT_43 461 #ifndef _SYS_SYSPROTO_H_ 462 struct ommap_args { 463 caddr_t addr; 464 int len; 465 int prot; 466 int flags; 467 int fd; 468 long pos; 469 }; 470 #endif 471 int 472 ommap(td, uap) 473 struct thread *td; 474 struct ommap_args *uap; 475 { 476 struct mmap_args nargs; 477 static const char cvtbsdprot[8] = { 478 0, 479 PROT_EXEC, 480 PROT_WRITE, 481 PROT_EXEC | PROT_WRITE, 482 PROT_READ, 483 PROT_EXEC | PROT_READ, 484 PROT_WRITE | PROT_READ, 485 PROT_EXEC | PROT_WRITE | PROT_READ, 486 }; 487 488 #define OMAP_ANON 0x0002 489 #define OMAP_COPY 0x0020 490 #define OMAP_SHARED 0x0010 491 #define OMAP_FIXED 0x0100 492 493 nargs.addr = uap->addr; 494 nargs.len = uap->len; 495 nargs.prot = cvtbsdprot[uap->prot & 0x7]; 496 nargs.flags = 0; 497 if (uap->flags & OMAP_ANON) 498 nargs.flags |= MAP_ANON; 499 if (uap->flags & OMAP_COPY) 500 nargs.flags |= MAP_COPY; 501 if (uap->flags & OMAP_SHARED) 502 nargs.flags |= MAP_SHARED; 503 else 504 nargs.flags |= MAP_PRIVATE; 505 if (uap->flags & OMAP_FIXED) 506 nargs.flags |= MAP_FIXED; 507 nargs.fd = uap->fd; 508 nargs.pos = uap->pos; 509 return (mmap(td, &nargs)); 510 } 511 #endif /* COMPAT_43 */ 512 513 514 #ifndef _SYS_SYSPROTO_H_ 515 struct msync_args { 516 void *addr; 517 int len; 518 int flags; 519 }; 520 #endif 521 /* 522 * MPSAFE 523 */ 524 int 525 msync(td, uap) 526 struct thread *td; 527 struct msync_args *uap; 528 { 529 vm_offset_t addr; 530 vm_size_t size, pageoff; 531 int flags; 532 vm_map_t map; 533 int rv; 534 535 addr = (vm_offset_t) uap->addr; 536 size = uap->len; 537 flags = uap->flags; 538 539 pageoff = (addr & PAGE_MASK); 540 addr -= pageoff; 541 size += pageoff; 542 size = (vm_size_t) round_page(size); 543 if (addr + size < addr) 544 return (EINVAL); 545 546 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 547 return (EINVAL); 548 549 mtx_lock(&Giant); 550 551 map = &td->td_proc->p_vmspace->vm_map; 552 553 /* 554 * XXX Gak! If size is zero we are supposed to sync "all modified 555 * pages with the region containing addr". Unfortunately, we don't 556 * really keep track of individual mmaps so we approximate by flushing 557 * the range of the map entry containing addr. This can be incorrect 558 * if the region splits or is coalesced with a neighbor. 559 */ 560 if (size == 0) { 561 vm_map_entry_t entry; 562 563 vm_map_lock_read(map); 564 rv = vm_map_lookup_entry(map, addr, &entry); 565 vm_map_unlock_read(map); 566 if (rv == FALSE) { 567 rv = -1; 568 goto done2; 569 } 570 addr = entry->start; 571 size = entry->end - entry->start; 572 } 573 574 /* 575 * Clean the pages and interpret the return value. 576 */ 577 rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0, 578 (flags & MS_INVALIDATE) != 0); 579 580 done2: 581 mtx_unlock(&Giant); 582 583 switch (rv) { 584 case KERN_SUCCESS: 585 return (0); 586 case KERN_INVALID_ADDRESS: 587 return (EINVAL); /* Sun returns ENOMEM? */ 588 case KERN_FAILURE: 589 return (EIO); 590 default: 591 return (EINVAL); 592 } 593 } 594 595 #ifndef _SYS_SYSPROTO_H_ 596 struct munmap_args { 597 void *addr; 598 size_t len; 599 }; 600 #endif 601 /* 602 * MPSAFE 603 */ 604 int 605 munmap(td, uap) 606 struct thread *td; 607 struct munmap_args *uap; 608 { 609 vm_offset_t addr; 610 vm_size_t size, pageoff; 611 vm_map_t map; 612 613 addr = (vm_offset_t) uap->addr; 614 size = uap->len; 615 616 pageoff = (addr & PAGE_MASK); 617 addr -= pageoff; 618 size += pageoff; 619 size = (vm_size_t) round_page(size); 620 if (addr + size < addr) 621 return (EINVAL); 622 623 if (size == 0) 624 return (0); 625 626 /* 627 * Check for illegal addresses. Watch out for address wrap... 628 */ 629 map = &td->td_proc->p_vmspace->vm_map; 630 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 631 return (EINVAL); 632 /* 633 * Make sure entire range is allocated. 634 */ 635 if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) 636 return (EINVAL); 637 638 /* returns nothing but KERN_SUCCESS anyway */ 639 (void) vm_map_remove(map, addr, addr + size); 640 return (0); 641 } 642 643 #if 0 644 void 645 munmapfd(td, fd) 646 struct thread *td; 647 int fd; 648 { 649 /* 650 * XXX should unmap any regions mapped to this file 651 */ 652 FILEDESC_LOCK(p->p_fd); 653 td->td_proc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; 654 FILEDESC_UNLOCK(p->p_fd); 655 } 656 #endif 657 658 #ifndef _SYS_SYSPROTO_H_ 659 struct mprotect_args { 660 const void *addr; 661 size_t len; 662 int prot; 663 }; 664 #endif 665 /* 666 * MPSAFE 667 */ 668 int 669 mprotect(td, uap) 670 struct thread *td; 671 struct mprotect_args *uap; 672 { 673 vm_offset_t addr; 674 vm_size_t size, pageoff; 675 vm_prot_t prot; 676 677 addr = (vm_offset_t) uap->addr; 678 size = uap->len; 679 prot = uap->prot & VM_PROT_ALL; 680 #if defined(VM_PROT_READ_IS_EXEC) 681 if (prot & VM_PROT_READ) 682 prot |= VM_PROT_EXECUTE; 683 #endif 684 685 pageoff = (addr & PAGE_MASK); 686 addr -= pageoff; 687 size += pageoff; 688 size = (vm_size_t) round_page(size); 689 if (addr + size < addr) 690 return (EINVAL); 691 692 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 693 addr + size, prot, FALSE)) { 694 case KERN_SUCCESS: 695 return (0); 696 case KERN_PROTECTION_FAILURE: 697 return (EACCES); 698 } 699 return (EINVAL); 700 } 701 702 #ifndef _SYS_SYSPROTO_H_ 703 struct minherit_args { 704 void *addr; 705 size_t len; 706 int inherit; 707 }; 708 #endif 709 /* 710 * MPSAFE 711 */ 712 int 713 minherit(td, uap) 714 struct thread *td; 715 struct minherit_args *uap; 716 { 717 vm_offset_t addr; 718 vm_size_t size, pageoff; 719 vm_inherit_t inherit; 720 721 addr = (vm_offset_t)uap->addr; 722 size = uap->len; 723 inherit = uap->inherit; 724 725 pageoff = (addr & PAGE_MASK); 726 addr -= pageoff; 727 size += pageoff; 728 size = (vm_size_t) round_page(size); 729 if (addr + size < addr) 730 return (EINVAL); 731 732 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 733 addr + size, inherit)) { 734 case KERN_SUCCESS: 735 return (0); 736 case KERN_PROTECTION_FAILURE: 737 return (EACCES); 738 } 739 return (EINVAL); 740 } 741 742 #ifndef _SYS_SYSPROTO_H_ 743 struct madvise_args { 744 void *addr; 745 size_t len; 746 int behav; 747 }; 748 #endif 749 750 /* 751 * MPSAFE 752 */ 753 /* ARGSUSED */ 754 int 755 madvise(td, uap) 756 struct thread *td; 757 struct madvise_args *uap; 758 { 759 vm_offset_t start, end; 760 vm_map_t map; 761 struct proc *p; 762 int error; 763 764 /* 765 * Check for our special case, advising the swap pager we are 766 * "immortal." 767 */ 768 if (uap->behav == MADV_PROTECT) { 769 error = suser(td); 770 if (error == 0) { 771 p = td->td_proc; 772 PROC_LOCK(p); 773 p->p_flag |= P_PROTECTED; 774 PROC_UNLOCK(p); 775 } 776 return (error); 777 } 778 /* 779 * Check for illegal behavior 780 */ 781 if (uap->behav < 0 || uap->behav > MADV_CORE) 782 return (EINVAL); 783 /* 784 * Check for illegal addresses. Watch out for address wrap... Note 785 * that VM_*_ADDRESS are not constants due to casts (argh). 786 */ 787 map = &td->td_proc->p_vmspace->vm_map; 788 if ((vm_offset_t)uap->addr < vm_map_min(map) || 789 (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) 790 return (EINVAL); 791 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 792 return (EINVAL); 793 794 /* 795 * Since this routine is only advisory, we default to conservative 796 * behavior. 797 */ 798 start = trunc_page((vm_offset_t) uap->addr); 799 end = round_page((vm_offset_t) uap->addr + uap->len); 800 801 if (vm_map_madvise(map, start, end, uap->behav)) 802 return (EINVAL); 803 return (0); 804 } 805 806 #ifndef _SYS_SYSPROTO_H_ 807 struct mincore_args { 808 const void *addr; 809 size_t len; 810 char *vec; 811 }; 812 #endif 813 814 /* 815 * MPSAFE 816 */ 817 /* ARGSUSED */ 818 int 819 mincore(td, uap) 820 struct thread *td; 821 struct mincore_args *uap; 822 { 823 vm_offset_t addr, first_addr; 824 vm_offset_t end, cend; 825 pmap_t pmap; 826 vm_map_t map; 827 char *vec; 828 int error = 0; 829 int vecindex, lastvecindex; 830 vm_map_entry_t current; 831 vm_map_entry_t entry; 832 int mincoreinfo; 833 unsigned int timestamp; 834 835 /* 836 * Make sure that the addresses presented are valid for user 837 * mode. 838 */ 839 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 840 end = addr + (vm_size_t)round_page(uap->len); 841 map = &td->td_proc->p_vmspace->vm_map; 842 if (end > vm_map_max(map) || end < addr) 843 return (EINVAL); 844 845 /* 846 * Address of byte vector 847 */ 848 vec = uap->vec; 849 850 pmap = vmspace_pmap(td->td_proc->p_vmspace); 851 852 vm_map_lock_read(map); 853 RestartScan: 854 timestamp = map->timestamp; 855 856 if (!vm_map_lookup_entry(map, addr, &entry)) 857 entry = entry->next; 858 859 /* 860 * Do this on a map entry basis so that if the pages are not 861 * in the current processes address space, we can easily look 862 * up the pages elsewhere. 863 */ 864 lastvecindex = -1; 865 for (current = entry; 866 (current != &map->header) && (current->start < end); 867 current = current->next) { 868 869 /* 870 * ignore submaps (for now) or null objects 871 */ 872 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 873 current->object.vm_object == NULL) 874 continue; 875 876 /* 877 * limit this scan to the current map entry and the 878 * limits for the mincore call 879 */ 880 if (addr < current->start) 881 addr = current->start; 882 cend = current->end; 883 if (cend > end) 884 cend = end; 885 886 /* 887 * scan this entry one page at a time 888 */ 889 while (addr < cend) { 890 /* 891 * Check pmap first, it is likely faster, also 892 * it can provide info as to whether we are the 893 * one referencing or modifying the page. 894 */ 895 mtx_lock(&Giant); 896 mincoreinfo = pmap_mincore(pmap, addr); 897 mtx_unlock(&Giant); 898 if (!mincoreinfo) { 899 vm_pindex_t pindex; 900 vm_ooffset_t offset; 901 vm_page_t m; 902 /* 903 * calculate the page index into the object 904 */ 905 offset = current->offset + (addr - current->start); 906 pindex = OFF_TO_IDX(offset); 907 VM_OBJECT_LOCK(current->object.vm_object); 908 m = vm_page_lookup(current->object.vm_object, 909 pindex); 910 /* 911 * if the page is resident, then gather information about 912 * it. 913 */ 914 if (m) { 915 mincoreinfo = MINCORE_INCORE; 916 vm_page_lock_queues(); 917 if (m->dirty || 918 pmap_is_modified(m)) 919 mincoreinfo |= MINCORE_MODIFIED_OTHER; 920 if ((m->flags & PG_REFERENCED) || 921 pmap_ts_referenced(m)) { 922 vm_page_flag_set(m, PG_REFERENCED); 923 mincoreinfo |= MINCORE_REFERENCED_OTHER; 924 } 925 vm_page_unlock_queues(); 926 } 927 VM_OBJECT_UNLOCK(current->object.vm_object); 928 } 929 930 /* 931 * subyte may page fault. In case it needs to modify 932 * the map, we release the lock. 933 */ 934 vm_map_unlock_read(map); 935 936 /* 937 * calculate index into user supplied byte vector 938 */ 939 vecindex = OFF_TO_IDX(addr - first_addr); 940 941 /* 942 * If we have skipped map entries, we need to make sure that 943 * the byte vector is zeroed for those skipped entries. 944 */ 945 while ((lastvecindex + 1) < vecindex) { 946 error = subyte(vec + lastvecindex, 0); 947 if (error) { 948 error = EFAULT; 949 goto done2; 950 } 951 ++lastvecindex; 952 } 953 954 /* 955 * Pass the page information to the user 956 */ 957 error = subyte(vec + vecindex, mincoreinfo); 958 if (error) { 959 error = EFAULT; 960 goto done2; 961 } 962 963 /* 964 * If the map has changed, due to the subyte, the previous 965 * output may be invalid. 966 */ 967 vm_map_lock_read(map); 968 if (timestamp != map->timestamp) 969 goto RestartScan; 970 971 lastvecindex = vecindex; 972 addr += PAGE_SIZE; 973 } 974 } 975 976 /* 977 * subyte may page fault. In case it needs to modify 978 * the map, we release the lock. 979 */ 980 vm_map_unlock_read(map); 981 982 /* 983 * Zero the last entries in the byte vector. 984 */ 985 vecindex = OFF_TO_IDX(end - first_addr); 986 while ((lastvecindex + 1) < vecindex) { 987 error = subyte(vec + lastvecindex, 0); 988 if (error) { 989 error = EFAULT; 990 goto done2; 991 } 992 ++lastvecindex; 993 } 994 995 /* 996 * If the map has changed, due to the subyte, the previous 997 * output may be invalid. 998 */ 999 vm_map_lock_read(map); 1000 if (timestamp != map->timestamp) 1001 goto RestartScan; 1002 vm_map_unlock_read(map); 1003 done2: 1004 return (error); 1005 } 1006 1007 #ifndef _SYS_SYSPROTO_H_ 1008 struct mlock_args { 1009 const void *addr; 1010 size_t len; 1011 }; 1012 #endif 1013 /* 1014 * MPSAFE 1015 */ 1016 int 1017 mlock(td, uap) 1018 struct thread *td; 1019 struct mlock_args *uap; 1020 { 1021 vm_offset_t addr; 1022 vm_size_t size, pageoff; 1023 int error; 1024 1025 addr = (vm_offset_t) uap->addr; 1026 size = uap->len; 1027 1028 pageoff = (addr & PAGE_MASK); 1029 addr -= pageoff; 1030 size += pageoff; 1031 size = (vm_size_t) round_page(size); 1032 1033 /* disable wrap around */ 1034 if (addr + size < addr) 1035 return (EINVAL); 1036 1037 if (atop(size) + cnt.v_wire_count > vm_page_max_wired) 1038 return (EAGAIN); 1039 1040 #if 0 1041 if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) > 1042 td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) 1043 return (ENOMEM); 1044 #else 1045 error = suser(td); 1046 if (error) 1047 return (error); 1048 #endif 1049 1050 error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr, 1051 addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); 1052 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1053 } 1054 1055 #ifndef _SYS_SYSPROTO_H_ 1056 struct mlockall_args { 1057 int how; 1058 }; 1059 #endif 1060 1061 /* 1062 * MPSAFE 1063 */ 1064 int 1065 mlockall(td, uap) 1066 struct thread *td; 1067 struct mlockall_args *uap; 1068 { 1069 vm_map_t map; 1070 int error; 1071 1072 map = &td->td_proc->p_vmspace->vm_map; 1073 error = 0; 1074 1075 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1076 return (EINVAL); 1077 1078 #if 0 1079 /* 1080 * If wiring all pages in the process would cause it to exceed 1081 * a hard resource limit, return ENOMEM. 1082 */ 1083 if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) > 1084 td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)) 1085 return (ENOMEM); 1086 #else 1087 error = suser(td); 1088 if (error) 1089 return (error); 1090 #endif 1091 1092 if (uap->how & MCL_FUTURE) { 1093 vm_map_lock(map); 1094 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1095 vm_map_unlock(map); 1096 error = 0; 1097 } 1098 1099 if (uap->how & MCL_CURRENT) { 1100 /* 1101 * P1003.1-2001 mandates that all currently mapped pages 1102 * will be memory resident and locked (wired) upon return 1103 * from mlockall(). vm_map_wire() will wire pages, by 1104 * calling vm_fault_wire() for each page in the region. 1105 */ 1106 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1107 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1108 error = (error == KERN_SUCCESS ? 0 : EAGAIN); 1109 } 1110 1111 return (error); 1112 } 1113 1114 #ifndef _SYS_SYSPROTO_H_ 1115 struct munlockall_args { 1116 register_t dummy; 1117 }; 1118 #endif 1119 1120 /* 1121 * MPSAFE 1122 */ 1123 int 1124 munlockall(td, uap) 1125 struct thread *td; 1126 struct munlockall_args *uap; 1127 { 1128 vm_map_t map; 1129 int error; 1130 1131 map = &td->td_proc->p_vmspace->vm_map; 1132 error = suser(td); 1133 if (error) 1134 return (error); 1135 1136 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1137 vm_map_lock(map); 1138 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1139 vm_map_unlock(map); 1140 1141 /* Forcibly unwire all pages. */ 1142 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1143 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1144 1145 return (error); 1146 } 1147 1148 #ifndef _SYS_SYSPROTO_H_ 1149 struct munlock_args { 1150 const void *addr; 1151 size_t len; 1152 }; 1153 #endif 1154 /* 1155 * MPSAFE 1156 */ 1157 int 1158 munlock(td, uap) 1159 struct thread *td; 1160 struct munlock_args *uap; 1161 { 1162 vm_offset_t addr; 1163 vm_size_t size, pageoff; 1164 int error; 1165 1166 addr = (vm_offset_t) uap->addr; 1167 size = uap->len; 1168 1169 pageoff = (addr & PAGE_MASK); 1170 addr -= pageoff; 1171 size += pageoff; 1172 size = (vm_size_t) round_page(size); 1173 1174 /* disable wrap around */ 1175 if (addr + size < addr) 1176 return (EINVAL); 1177 1178 error = suser(td); 1179 if (error) 1180 return (error); 1181 1182 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr, 1183 addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); 1184 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1185 } 1186 1187 /* 1188 * vm_mmap() 1189 * 1190 * MPSAFE 1191 * 1192 * Internal version of mmap. Currently used by mmap, exec, and sys5 1193 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1194 */ 1195 int 1196 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1197 vm_prot_t maxprot, int flags, 1198 void *handle, 1199 vm_ooffset_t foff) 1200 { 1201 boolean_t fitit; 1202 vm_object_t object; 1203 struct vnode *vp = NULL; 1204 objtype_t type; 1205 int rv = KERN_SUCCESS; 1206 vm_ooffset_t objsize; 1207 int docow; 1208 struct thread *td = curthread; 1209 1210 if (size == 0) 1211 return (0); 1212 1213 objsize = size = round_page(size); 1214 1215 if (td->td_proc->p_vmspace->vm_map.size + size > 1216 td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) { 1217 return(ENOMEM); 1218 } 1219 1220 /* 1221 * We currently can only deal with page aligned file offsets. 1222 * The check is here rather than in the syscall because the 1223 * kernel calls this function internally for other mmaping 1224 * operations (such as in exec) and non-aligned offsets will 1225 * cause pmap inconsistencies...so we want to be sure to 1226 * disallow this in all cases. 1227 */ 1228 if (foff & PAGE_MASK) 1229 return (EINVAL); 1230 1231 if ((flags & MAP_FIXED) == 0) { 1232 fitit = TRUE; 1233 *addr = round_page(*addr); 1234 } else { 1235 if (*addr != trunc_page(*addr)) 1236 return (EINVAL); 1237 fitit = FALSE; 1238 (void) vm_map_remove(map, *addr, *addr + size); 1239 } 1240 1241 /* 1242 * Lookup/allocate object. 1243 */ 1244 if (flags & MAP_ANON) { 1245 type = OBJT_DEFAULT; 1246 /* 1247 * Unnamed anonymous regions always start at 0. 1248 */ 1249 if (handle == 0) 1250 foff = 0; 1251 } else { 1252 vp = (struct vnode *) handle; 1253 mtx_lock(&Giant); 1254 ASSERT_VOP_LOCKED(vp, "vm_mmap"); 1255 if (vp->v_type == VCHR) { 1256 type = OBJT_DEVICE; 1257 handle = vp->v_rdev; 1258 } else { 1259 struct vattr vat; 1260 int error; 1261 1262 error = VOP_GETATTR(vp, &vat, td->td_ucred, td); 1263 if (error) { 1264 mtx_unlock(&Giant); 1265 return (error); 1266 } 1267 objsize = round_page(vat.va_size); 1268 type = OBJT_VNODE; 1269 /* 1270 * if it is a regular file without any references 1271 * we do not need to sync it. 1272 */ 1273 if (vp->v_type == VREG && vat.va_nlink == 0) { 1274 flags |= MAP_NOSYNC; 1275 } 1276 } 1277 mtx_unlock(&Giant); 1278 } 1279 1280 if (handle == NULL) { 1281 object = NULL; 1282 docow = 0; 1283 } else { 1284 object = vm_pager_allocate(type, 1285 handle, objsize, prot, foff); 1286 if (object == NULL) { 1287 return (type == OBJT_DEVICE ? EINVAL : ENOMEM); 1288 } 1289 docow = MAP_PREFAULT_PARTIAL; 1290 } 1291 1292 /* 1293 * Force device mappings to be shared. 1294 */ 1295 if (type == OBJT_DEVICE) { 1296 flags &= ~(MAP_PRIVATE|MAP_COPY); 1297 flags |= MAP_SHARED; 1298 } 1299 1300 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1301 docow |= MAP_COPY_ON_WRITE; 1302 if (flags & MAP_NOSYNC) 1303 docow |= MAP_DISABLE_SYNCER; 1304 if (flags & MAP_NOCORE) 1305 docow |= MAP_DISABLE_COREDUMP; 1306 1307 #if defined(VM_PROT_READ_IS_EXEC) 1308 if (prot & VM_PROT_READ) 1309 prot |= VM_PROT_EXECUTE; 1310 1311 if (maxprot & VM_PROT_READ) 1312 maxprot |= VM_PROT_EXECUTE; 1313 #endif 1314 1315 if (fitit) 1316 *addr = pmap_addr_hint(object, *addr, size); 1317 1318 if (flags & MAP_STACK) 1319 rv = vm_map_stack(map, *addr, size, prot, maxprot, 1320 docow | MAP_STACK_GROWS_DOWN); 1321 else 1322 rv = vm_map_find(map, object, foff, addr, size, fitit, 1323 prot, maxprot, docow); 1324 1325 if (rv != KERN_SUCCESS) { 1326 /* 1327 * Lose the object reference. Will destroy the 1328 * object if it's an unnamed anonymous mapping 1329 * or named anonymous without other references. 1330 */ 1331 vm_object_deallocate(object); 1332 } else if (flags & MAP_SHARED) { 1333 /* 1334 * Shared memory is also shared with children. 1335 */ 1336 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); 1337 if (rv != KERN_SUCCESS) 1338 (void) vm_map_remove(map, *addr, *addr + size); 1339 } 1340 1341 /* 1342 * If the process has requested that all future mappings 1343 * be wired, then heed this. 1344 */ 1345 if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE)) 1346 vm_map_wire(map, *addr, *addr + size, 1347 VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); 1348 1349 switch (rv) { 1350 case KERN_SUCCESS: 1351 return (0); 1352 case KERN_INVALID_ADDRESS: 1353 case KERN_NO_SPACE: 1354 return (ENOMEM); 1355 case KERN_PROTECTION_FAILURE: 1356 return (EACCES); 1357 default: 1358 return (EINVAL); 1359 } 1360 } 1361