/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mac.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
        int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.
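 * A vm_map_entry describes one contiguous mapped range of a process's
 * address space; each mmap() typically creates at least one.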
 * Roughly speaking, vm_map_entry structures are tiny, so allowing them to
 * eat 1/100 of our KVM malloc space still results in generous limits.  We
 * want a default that is good enough to prevent the kernel from running out
 * of resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
        void *dummy;
{
        max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
        max_proc_mmap /= 100;
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
        struct thread *td;
        struct sbrk_args *uap;
{
        /* Not yet implemented */
        /* mtx_lock(&Giant); */
        /* mtx_unlock(&Giant); */
        return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
        struct thread *td;
        struct sstk_args *uap;
{
        /* Not yet implemented */
        /* mtx_lock(&Giant); */
        /* mtx_unlock(&Giant); */
        return (EOPNOTSUPP);
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
        int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
        struct thread *td;
        struct getpagesize_args *uap;
{
        /* MP SAFE */
        td->td_retval[0] = PAGE_SIZE;
        return (0);
}
#endif	/* COMPAT_43 || COMPAT_SUNOS */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
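 *
 * As an illustrative sketch of the offset handling (assuming a 4096-byte
 * PAGE_SIZE): a call such as mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 4000)
 * is backed by a two-page mapping of the file starting at offset 0, and the
 * address returned to the caller points 4000 bytes into that mapping.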
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
        void *addr;
        size_t len;
        int prot;
        int flags;
        int fd;
        long pad;
        off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
        struct thread *td;
        struct mmap_args *uap;
{
        struct file *fp = NULL;
        struct vnode *vp;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot, maxprot;
        void *handle;
        int flags, error;
        int disablexworkaround;
        off_t pos;
        struct vmspace *vms = td->td_proc->p_vmspace;
        vm_object_t obj;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
        flags = uap->flags;
        pos = uap->pos;

        vp = NULL;
        fp = NULL;
        /* Make sure the mapping fits into the numeric range, etc. */
        if ((ssize_t) uap->len < 0 ||
            ((flags & MAP_ANON) && uap->fd != -1))
                return (EINVAL);

        if (flags & MAP_STACK) {
                if ((uap->fd != -1) ||
                    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;			/* low end... */
        size = (vm_size_t) round_page(size);	/* hi end */

        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);
                /* Address range must be all in user VM space. */
                if (addr < vm_map_min(&vms->vm_map) ||
                    addr + size > vm_map_max(&vms->vm_map))
                        return (EINVAL);
                if (addr + size < addr)
                        return (EINVAL);
        }
        /*
         * XXX for non-fixed mappings where no hint is provided or
         * the hint would fall in the potential heap space,
         * place it after the end of the largest possible heap.
         *
         * There should really be a pmap call to determine a reasonable
         * location.
         */
        else if (addr == 0 ||
            (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
             addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
                addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);

        mtx_lock(&Giant);	/* syscall marked mp-safe but isn't */
        do {
                if (flags & MAP_ANON) {
                        /*
                         * Mapping blank space is trivial.
                         */
                        handle = NULL;
                        maxprot = VM_PROT_ALL;
                        pos = 0;
                        break;
                }
                /*
                 * Mapping a file, so get fp for validation.  Obtain the vnode
                 * and make sure it is of the appropriate type.  Don't let the
                 * descriptor disappear on us if we block.
                 */
                if ((error = fget(td, uap->fd, &fp)) != 0)
                        goto done;
                if (fp->f_type != DTYPE_VNODE) {
                        error = EINVAL;
                        goto done;
                }

                /*
                 * POSIX shared-memory objects are defined to have
                 * kernel persistence, and are not defined to support
                 * read(2)/write(2) -- or even open(2).  Thus, we can
                 * use MAP_NOSYNC to trade on-disk coherence for speed.
                 * The shm_open(3) library routine turns on the FPOSIXSHM
                 * flag to request this behavior.
                 */
                if (fp->f_flag & FPOSIXSHM)
                        flags |= MAP_NOSYNC;
                vp = fp->f_vnode;
                error = vget(vp, LK_EXCLUSIVE, td);
                if (error)
                        goto done;
                if (vp->v_type != VREG && vp->v_type != VCHR) {
                        error = EINVAL;
                        goto done;
                }
                if (vp->v_type == VREG) {
                        /*
                         * Get the proper underlying object.
                         */
                        if (VOP_GETVOBJECT(vp, &obj) != 0) {
                                error = EINVAL;
                                goto done;
                        }
                        if (obj->handle != vp) {
                                vput(vp);
                                vp = (struct vnode *)obj->handle;
                                vget(vp, LK_EXCLUSIVE, td);
                        }
                }
                /*
                 * XXX hack to handle use of /dev/zero to map anon memory (ala
                 * SunOS).
                 */
                if ((vp->v_type == VCHR) &&
                    (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
                        handle = NULL;
                        maxprot = VM_PROT_ALL;
                        flags |= MAP_ANON;
                        pos = 0;
                        break;
                }
                /*
                 * cdevs do not provide private mappings of any kind.
                 */
                /*
                 * However, for the XIG X server to continue to work,
                 * we should allow the superuser to do it anyway.
                 * We only allow it at securelevel < 1.
                 * (Because the XIG X server writes directly to video
                 * memory via /dev/mem, it should never work at any
                 * other securelevel.)
                 * XXX this will have to go
                 */
                if (securelevel_ge(td->td_ucred, 1))
                        disablexworkaround = 1;
                else
                        disablexworkaround = suser(td);
                if (vp->v_type == VCHR && disablexworkaround &&
                    (flags & (MAP_PRIVATE|MAP_COPY))) {
                        error = EINVAL;
                        goto done;
                }
                /*
                 * Ensure that file and memory protections are
                 * compatible.  Note that we only worry about
                 * writability if mapping is shared; in this case,
                 * current and max prot are dictated by the open file.
                 * XXX use the vnode instead?  Problem is: what
                 * credentials do we use for determination?  What if
                 * proc does a setuid?
                 */
                maxprot = VM_PROT_EXECUTE;	/* ??? */
                if (fp->f_flag & FREAD) {
                        maxprot |= VM_PROT_READ;
                } else if (prot & PROT_READ) {
                        error = EACCES;
                        goto done;
                }
                /*
                 * If we are sharing potential changes (either via
                 * MAP_SHARED or via the implicit sharing of character
                 * device mappings), and we are trying to get write
                 * permission although we opened it without asking
                 * for it, bail out.  Check for superuser, only if
                 * we're at securelevel < 1, to allow the XIG X server
                 * to continue to work.
                 */
                if ((flags & MAP_SHARED) != 0 ||
                    (vp->v_type == VCHR && disablexworkaround)) {
                        if ((fp->f_flag & FWRITE) != 0) {
                                struct vattr va;
                                if ((error =
                                    VOP_GETATTR(vp, &va,
                                        td->td_ucred, td))) {
                                        goto done;
                                }
                                if ((va.va_flags &
                                    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
                                        maxprot |= VM_PROT_WRITE;
                                } else if (prot & PROT_WRITE) {
                                        error = EPERM;
                                        goto done;
                                }
                        } else if ((prot & PROT_WRITE) != 0) {
                                error = EACCES;
                                goto done;
                        }
                } else {
                        maxprot |= VM_PROT_WRITE;
                }

                handle = (void *)vp;
        } while (0);

        /*
         * Do not allow more than a certain number of vm_map_entry structures
         * per process.  Scale with the number of rforks sharing the map
         * to make the limit reasonable for threads.
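         * (vm_refcnt below is the number of processes sharing this vmspace,
         * so the limit grows with the number of such sharers.)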
         */
        if (max_proc_mmap &&
            vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
                error = ENOMEM;
                goto done;
        }

        mtx_unlock(&Giant);
        error = 0;
#ifdef MAC
        if (handle != NULL && (flags & MAP_SHARED) != 0) {
                error = mac_check_vnode_mmap(td->td_ucred,
                    (struct vnode *)handle, prot);
        }
#endif
        if (error == 0)
                error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
                    flags, handle, pos);
        mtx_lock(&Giant);
        if (error == 0)
                td->td_retval[0] = (register_t) (addr + pageoff);
done:
        if (vp)
                vput(vp);
        mtx_unlock(&Giant);
        if (fp)
                fdrop(fp, td);

        return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
        caddr_t addr;
        int len;
        int prot;
        int flags;
        int fd;
        long pos;
};
#endif
int
ommap(td, uap)
        struct thread *td;
        struct ommap_args *uap;
{
        struct mmap_args nargs;
        static const char cvtbsdprot[8] = {
                0,
                PROT_EXEC,
                PROT_WRITE,
                PROT_EXEC | PROT_WRITE,
                PROT_READ,
                PROT_EXEC | PROT_READ,
                PROT_WRITE | PROT_READ,
                PROT_EXEC | PROT_WRITE | PROT_READ,
        };

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

        nargs.addr = uap->addr;
        nargs.len = uap->len;
        nargs.prot = cvtbsdprot[uap->prot & 0x7];
        nargs.flags = 0;
        if (uap->flags & OMAP_ANON)
                nargs.flags |= MAP_ANON;
        if (uap->flags & OMAP_COPY)
                nargs.flags |= MAP_COPY;
        if (uap->flags & OMAP_SHARED)
                nargs.flags |= MAP_SHARED;
        else
                nargs.flags |= MAP_PRIVATE;
        if (uap->flags & OMAP_FIXED)
                nargs.flags |= MAP_FIXED;
        nargs.fd = uap->fd;
        nargs.pos = uap->pos;
        return (mmap(td, &nargs));
}
#endif	/* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
        void *addr;
        int len;
        int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
        struct thread *td;
        struct msync_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int flags;
        vm_map_t map;
        int rv;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        flags = uap->flags;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
                return (EINVAL);

        mtx_lock(&Giant);

        map = &td->td_proc->p_vmspace->vm_map;

        /*
         * XXX Gak!  If size is zero we are supposed to sync "all modified
         * pages with the region containing addr".  Unfortunately, we don't
         * really keep track of individual mmaps so we approximate by flushing
         * the range of the map entry containing addr.  This can be incorrect
         * if the region splits or is coalesced with a neighbor.
         */
        if (size == 0) {
                vm_map_entry_t entry;

                vm_map_lock_read(map);
                rv = vm_map_lookup_entry(map, addr, &entry);
                vm_map_unlock_read(map);
                if (rv == FALSE) {
                        rv = -1;
                        goto done2;
                }
                addr = entry->start;
                size = entry->end - entry->start;
        }

        /*
         * Clean the pages and interpret the return value.
         */
        rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
            (flags & MS_INVALIDATE) != 0);

done2:
        mtx_unlock(&Giant);

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
                return (EINVAL);	/* Sun returns ENOMEM? */
        case KERN_FAILURE:
                return (EIO);
        default:
                return (EINVAL);
        }
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
        void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
        struct thread *td;
        struct munmap_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if (size == 0)
                return (0);

        /*
         * Check for illegal addresses.  Watch out for address wrap...
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
                return (EINVAL);
        /*
         * Make sure the entire range is allocated.
         */
        if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
                return (EINVAL);

        /* returns nothing but KERN_SUCCESS anyway */
        (void) vm_map_remove(map, addr, addr + size);
        return (0);
}

#if 0
void
munmapfd(td, fd)
        struct thread *td;
        int fd;
{
        /*
         * XXX should unmap any regions mapped to this file
         */
        FILEDESC_LOCK(td->td_proc->p_fd);
        td->td_proc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
        FILEDESC_UNLOCK(td->td_proc->p_fd);
}
#endif

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
        const void *addr;
        size_t len;
        int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
        struct thread *td;
        struct mprotect_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;
#endif

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, prot, FALSE)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
        void *addr;
        size_t len;
        int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
        struct thread *td;
        struct minherit_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_inherit_t inherit;

        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        inherit = uap->inherit;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, inherit)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
        void *addr;
        size_t len;
        int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
        struct thread *td;
        struct madvise_args *uap;
{
        vm_offset_t start, end;
        vm_map_t map;
        struct proc *p;
        int error;

        /*
         * Check for our special case, advising the swap pager we are
         * "immortal."
         */
        if (uap->behav == MADV_PROTECT) {
                error = suser(td);
                if (error == 0) {
                        p = td->td_proc;
                        PROC_LOCK(p);
                        p->p_flag |= P_PROTECTED;
                        PROC_UNLOCK(p);
                }
                return (error);
        }
        /*
         * Check for illegal behavior.
         */
        if (uap->behav < 0 || uap->behav > MADV_CORE)
                return (EINVAL);
        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if ((vm_offset_t)uap->addr < vm_map_min(map) ||
            (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
                return (EINVAL);
        if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
                return (EINVAL);

        /*
         * Since this routine is only advisory, we default to conservative
         * behavior.
         */
        start = trunc_page((vm_offset_t) uap->addr);
        end = round_page((vm_offset_t) uap->addr + uap->len);

        if (vm_map_madvise(map, start, end, uap->behav))
                return (EINVAL);
        return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
        const void *addr;
        size_t len;
        char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
        struct thread *td;
        struct mincore_args *uap;
{
        vm_offset_t addr, first_addr;
        vm_offset_t end, cend;
        pmap_t pmap;
        vm_map_t map;
        char *vec;
        int error = 0;
        int vecindex, lastvecindex;
        vm_map_entry_t current;
        vm_map_entry_t entry;
        int mincoreinfo;
        unsigned int timestamp;

        /*
         * Make sure that the addresses presented are valid for user
         * mode.
         */
        first_addr = addr = trunc_page((vm_offset_t) uap->addr);
        end = addr + (vm_size_t)round_page(uap->len);
        map = &td->td_proc->p_vmspace->vm_map;
        if (end > vm_map_max(map) || end < addr)
                return (EINVAL);

        /*
         * Address of byte vector
         */
        vec = uap->vec;

        pmap = vmspace_pmap(td->td_proc->p_vmspace);

        vm_map_lock_read(map);
RestartScan:
        timestamp = map->timestamp;

        if (!vm_map_lookup_entry(map, addr, &entry))
                entry = entry->next;

        /*
         * Do this on a map entry basis so that if the pages are not
         * in the current process's address space, we can easily look
         * up the pages elsewhere.
         */
        lastvecindex = -1;
        for (current = entry;
            (current != &map->header) && (current->start < end);
            current = current->next) {

                /*
                 * ignore submaps (for now) or null objects
                 */
                if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
                    current->object.vm_object == NULL)
                        continue;

                /*
                 * limit this scan to the current map entry and the
                 * limits for the mincore call
                 */
                if (addr < current->start)
                        addr = current->start;
                cend = current->end;
                if (cend > end)
                        cend = end;

                /*
                 * scan this entry one page at a time
                 */
                while (addr < cend) {
                        /*
                         * Check the pmap first; it is likely faster, and it
                         * can also tell whether we are the one referencing
                         * or modifying the page.
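                         * pmap_mincore() returns 0 when the pmap has no
                         * mapping for this address, in which case we fall
                         * back to the object's resident page below.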
                         */
                        mtx_lock(&Giant);
                        mincoreinfo = pmap_mincore(pmap, addr);
                        mtx_unlock(&Giant);
                        if (!mincoreinfo) {
                                vm_pindex_t pindex;
                                vm_ooffset_t offset;
                                vm_page_t m;
                                /*
                                 * calculate the page index into the object
                                 */
                                offset = current->offset + (addr - current->start);
                                pindex = OFF_TO_IDX(offset);
                                VM_OBJECT_LOCK(current->object.vm_object);
                                m = vm_page_lookup(current->object.vm_object,
                                    pindex);
                                /*
                                 * if the page is resident, then gather
                                 * information about it.
                                 */
                                if (m) {
                                        mincoreinfo = MINCORE_INCORE;
                                        vm_page_lock_queues();
                                        if (m->dirty ||
                                            pmap_is_modified(m))
                                                mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                        if ((m->flags & PG_REFERENCED) ||
                                            pmap_ts_referenced(m)) {
                                                vm_page_flag_set(m, PG_REFERENCED);
                                                mincoreinfo |= MINCORE_REFERENCED_OTHER;
                                        }
                                        vm_page_unlock_queues();
                                }
                                VM_OBJECT_UNLOCK(current->object.vm_object);
                        }

                        /*
                         * subyte may page fault.  In case it needs to modify
                         * the map, we release the lock.
                         */
                        vm_map_unlock_read(map);

                        /*
                         * calculate index into user supplied byte vector
                         */
                        vecindex = OFF_TO_IDX(addr - first_addr);

                        /*
                         * If we have skipped map entries, we need to make sure
                         * that the byte vector is zeroed for those skipped
                         * entries.
                         */
                        while ((lastvecindex + 1) < vecindex) {
                                ++lastvecindex;
                                error = subyte(vec + lastvecindex, 0);
                                if (error) {
                                        error = EFAULT;
                                        goto done2;
                                }
                        }

                        /*
                         * Pass the page information to the user
                         */
                        error = subyte(vec + vecindex, mincoreinfo);
                        if (error) {
                                error = EFAULT;
                                goto done2;
                        }

                        /*
                         * If the map has changed, due to the subyte, the
                         * previous output may be invalid.
                         */
                        vm_map_lock_read(map);
                        if (timestamp != map->timestamp)
                                goto RestartScan;

                        lastvecindex = vecindex;
                        addr += PAGE_SIZE;
                }
        }

        /*
         * subyte may page fault.  In case it needs to modify
         * the map, we release the lock.
         */
        vm_map_unlock_read(map);

        /*
         * Zero the last entries in the byte vector.
         */
        vecindex = OFF_TO_IDX(end - first_addr);
        while ((lastvecindex + 1) < vecindex) {
                ++lastvecindex;
                error = subyte(vec + lastvecindex, 0);
                if (error) {
                        error = EFAULT;
                        goto done2;
                }
        }

        /*
         * If the map has changed, due to the subyte, the previous
         * output may be invalid.
         */
        vm_map_lock_read(map);
        if (timestamp != map->timestamp)
                goto RestartScan;
        vm_map_unlock_read(map);
done2:
        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
        struct thread *td;
        struct mlock_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int error;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);

        /* disable wrap around */
        if (addr + size < addr)
                return (EINVAL);

        if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
                return (EAGAIN);

#ifdef pmap_wired_count
        if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) >
            td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
                return (ENOMEM);
#else
        error = suser(td);
        if (error)
                return (error);
#endif

        error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
        int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
        struct thread *td;
        struct mlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = 0;

        if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
                return (EINVAL);

#ifdef pmap_wired_count
        /*
         * If wiring all pages in the process would cause it to exceed
         * a hard resource limit, return ENOMEM.
         */
        if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
            td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
                return (ENOMEM);
#else
        error = suser(td);
        if (error)
                return (error);
#endif

        if (uap->how & MCL_FUTURE) {
                vm_map_lock(map);
                vm_map_modflags(map, MAP_WIREFUTURE, 0);
                vm_map_unlock(map);
                error = 0;
        }

        if (uap->how & MCL_CURRENT) {
                /*
                 * P1003.1-2001 mandates that all currently mapped pages
                 * will be memory resident and locked (wired) upon return
                 * from mlockall().  vm_map_wire() will wire pages, by
                 * calling vm_fault_wire() for each page in the region.
                 */
                error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
                error = (error == KERN_SUCCESS ? 0 : EAGAIN);
        }

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
        register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
        struct thread *td;
        struct munlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
#ifndef pmap_wired_count
        error = suser(td);
        if (error)
                return (error);
#endif

        /* Clear the MAP_WIREFUTURE flag from this vm_map. */
        vm_map_lock(map);
        vm_map_modflags(map, 0, MAP_WIREFUTURE);
        vm_map_unlock(map);

        /* Forcibly unwire all pages. */
        error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
            VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
        struct thread *td;
        struct munlock_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int error;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);

        /* disable wrap around */
        if (addr + size < addr)
                return (EINVAL);

#ifndef pmap_wired_count
        error = suser(td);
        if (error)
                return (error);
#endif

        error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
        vm_prot_t maxprot, int flags,
        void *handle,
        vm_ooffset_t foff)
{
        boolean_t fitit;
        vm_object_t object;
        struct vnode *vp = NULL;
        objtype_t type;
        int rv = KERN_SUCCESS;
        vm_ooffset_t objsize;
        int docow;
        struct thread *td = curthread;

        if (size == 0)
                return (0);

        objsize = size = round_page(size);

        if (td->td_proc->p_vmspace->vm_map.size + size >
            td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
                return (ENOMEM);
        }

        /*
         * We currently can only deal with page aligned file offsets.
         * The check is here rather than in the syscall because the
         * kernel calls this function internally for other mmapping
         * operations (such as in exec) and non-aligned offsets will
         * cause pmap inconsistencies...so we want to be sure to
         * disallow this in all cases.
         */
        if (foff & PAGE_MASK)
                return (EINVAL);

        if ((flags & MAP_FIXED) == 0) {
                fitit = TRUE;
                *addr = round_page(*addr);
        } else {
                if (*addr != trunc_page(*addr))
                        return (EINVAL);
                fitit = FALSE;
                (void) vm_map_remove(map, *addr, *addr + size);
        }

        /*
         * Lookup/allocate object.
         */
        if (flags & MAP_ANON) {
                type = OBJT_DEFAULT;
                /*
                 * Unnamed anonymous regions always start at 0.
                 */
                if (handle == 0)
                        foff = 0;
        } else {
                vp = (struct vnode *) handle;
                mtx_lock(&Giant);
                ASSERT_VOP_LOCKED(vp, "vm_mmap");
                if (vp->v_type == VCHR) {
                        type = OBJT_DEVICE;
                        handle = vp->v_rdev;
                } else {
                        struct vattr vat;
                        int error;

                        error = VOP_GETATTR(vp, &vat, td->td_ucred, td);
                        if (error) {
                                mtx_unlock(&Giant);
                                return (error);
                        }
                        objsize = round_page(vat.va_size);
                        type = OBJT_VNODE;
                        /*
                         * If it is a regular file without any references,
                         * we do not need to sync it.
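                         * (A link count of zero here means the file has been
                         * unlinked while still open, so its contents cannot
                         * be reached through the name space again.)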
                         */
                        if (vp->v_type == VREG && vat.va_nlink == 0) {
                                flags |= MAP_NOSYNC;
                        }
                }
                mtx_unlock(&Giant);
        }

        if (handle == NULL) {
                object = NULL;
                docow = 0;
        } else {
                object = vm_pager_allocate(type,
                    handle, objsize, prot, foff);
                if (object == NULL) {
                        return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
                }
                docow = MAP_PREFAULT_PARTIAL;
        }

        /*
         * Force device mappings to be shared.
         */
        if (type == OBJT_DEVICE) {
                flags &= ~(MAP_PRIVATE|MAP_COPY);
                flags |= MAP_SHARED;
        }

        if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
                docow |= MAP_COPY_ON_WRITE;
        if (flags & MAP_NOSYNC)
                docow |= MAP_DISABLE_SYNCER;
        if (flags & MAP_NOCORE)
                docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;

        if (maxprot & VM_PROT_READ)
                maxprot |= VM_PROT_EXECUTE;
#endif

        if (fitit)
                *addr = pmap_addr_hint(object, *addr, size);

        if (flags & MAP_STACK)
                rv = vm_map_stack(map, *addr, size, prot,
                    maxprot, docow);
        else
                rv = vm_map_find(map, object, foff, addr, size, fitit,
                    prot, maxprot, docow);

        if (rv != KERN_SUCCESS) {
                /*
                 * Lose the object reference.  Will destroy the
                 * object if it's an unnamed anonymous mapping
                 * or named anonymous without other references.
                 */
                vm_object_deallocate(object);
        } else if (flags & MAP_SHARED) {
                /*
                 * Shared memory is also shared with children.
                 */
                rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
                if (rv != KERN_SUCCESS)
                        (void) vm_map_remove(map, *addr, *addr + size);
        }

        /*
         * If the process has requested that all future mappings
         * be wired, then heed this.
         */
        if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
                vm_map_wire(map, *addr, *addr + size,
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
        case KERN_NO_SPACE:
                return (ENOMEM);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        default:
                return (EINVAL);
        }
}