/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mac.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.
 * Roughly speaking vm_map_entry structures are tiny, so allowing them to
 * eat 1/100 of our KVM malloc space still results in generous limits.  We
 * want a default that is good enough to prevent the kernel from running out
 * of resources if attacked from a compromised user account but generous
 * enough such that multi-threaded processes are not unduly inconvenienced.
 */
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
	void *dummy;
{
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	/* mtx_lock(&Giant); */
	/* mtx_unlock(&Giant); */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	/* mtx_lock(&Giant); */
	/* mtx_unlock(&Giant); */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */
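/*
 * Illustrative example (assumes a PAGE_SIZE of 4096 and a hypothetical
 * open descriptor fd):
 *
 *	p = mmap(NULL, 100, PROT_READ, MAP_SHARED, fd, 0x1234);
 *
 * maps the file starting at the page boundary trunc_page(0x1234) == 0x1000
 * and returns a pointer whose low-order bits are 0x234, so that p refers
 * to file offset 0x1234 exactly.  With MAP_FIXED, the requested address
 * must carry the same low-order bits (0x234) or EINVAL is returned.
 */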
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
	struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	int disablexworkaround;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	vm_object_t obj;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	vp = NULL;
	fp = NULL;
	/* Make sure mapping fits into numeric range, etc. */
	if ((ssize_t) uap->len < 0 ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	}
	/*
	 * XXX for non-fixed mappings where no hint is provided or
	 * the hint would fall in the potential heap space,
	 * place it after the end of the largest possible heap.
	 *
	 * There should really be a pmap call to determine a reasonable
	 * location.
	 */
	else if (addr == 0 ||
	    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
	     addr < round_page((vm_offset_t)vms->vm_daddr +
	      td->td_proc->p_rlimit[RLIMIT_DATA].rlim_max)))
		addr = round_page((vm_offset_t)vms->vm_daddr +
		    td->td_proc->p_rlimit[RLIMIT_DATA].rlim_max);

	mtx_lock(&Giant);	/* syscall marked mp-safe but isn't */
	do {
		if (flags & MAP_ANON) {
			/*
			 * Mapping blank space is trivial.
			 */
			handle = NULL;
			maxprot = VM_PROT_ALL;
			pos = 0;
			break;
		}
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and
		 * make sure it is of appropriate type.  Don't let the
		 * descriptor disappear on us if we block.
		 */
		if ((error = fget(td, uap->fd, &fp)) != 0)
			goto done;
		if (fp->f_type != DTYPE_VNODE) {
			error = EINVAL;
			goto done;
		}

		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_NOSYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = fp->f_vnode;
		error = vget(vp, LK_EXCLUSIVE, td);
		if (error)
			goto done;
		if (vp->v_type != VREG && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
		if (vp->v_type == VREG) {
			/*
			 * Get the proper underlying object
			 */
			if (VOP_GETVOBJECT(vp, &obj) != 0) {
				error = EINVAL;
				goto done;
			}
			if (obj->handle != vp) {
				vput(vp);
				vp = (struct vnode*)obj->handle;
				vget(vp, LK_EXCLUSIVE, td);
			}
		}
		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if ((vp->v_type == VCHR) &&
		    (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
			break;
		}
		/*
		 * cdevs do not provide private mappings of any kind.
		 */
		/*
		 * However, for the XIG X server to continue to work,
		 * we should allow the superuser to do it anyway.
		 * We only allow it at securelevel < 1.
		 * (Because the XIG X server writes directly to video
		 * memory via /dev/mem, it should never work at any
		 * other securelevel.)
		 * XXX this will have to go
		 */
		if (securelevel_ge(td->td_ucred, 1))
			disablexworkaround = 1;
		else
			disablexworkaround = suser(td);
		if (vp->v_type == VCHR && disablexworkaround &&
		    (flags & (MAP_PRIVATE|MAP_COPY))) {
			error = EINVAL;
			goto done;
		}
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		maxprot = VM_PROT_EXECUTE;	/* ??? */
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.  Check for superuser, only if
		 * we're at securelevel < 1, to allow the XIG X server
		 * to continue to work.
		 */
		if ((flags & MAP_SHARED) != 0 ||
		    (vp->v_type == VCHR && disablexworkaround)) {
			if ((fp->f_flag & FWRITE) != 0) {
				struct vattr va;
				if ((error =
				    VOP_GETATTR(vp, &va, td->td_ucred, td))) {
					goto done;
				}
				if ((va.va_flags &
				    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
					maxprot |= VM_PROT_WRITE;
				} else if (prot & PROT_WRITE) {
					error = EPERM;
					goto done;
				}
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else {
			maxprot |= VM_PROT_WRITE;
		}

		handle = (void *)vp;
	} while (0);

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	mtx_unlock(&Giant);
	error = 0;
#ifdef MAC
	if (handle != NULL && (flags & MAP_SHARED) != 0) {
		error = mac_check_vnode_mmap(td->td_ucred,
		    (struct vnode *)handle, prot);
	}
#endif
	if (error == 0)
		error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
		    flags, handle, pos);
	mtx_lock(&Giant);
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (vp)
		vput(vp);
	mtx_unlock(&Giant);
	if (fp)
		fdrop(fp, td);

	return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
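	/*
	 * Translate the old protection bits, where 1 is execute, 2 is
	 * write and 4 is read (as encoded by the table index below),
	 * into the modern PROT_* values, which use the opposite order.
	 */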
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	int len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_delete(map, addr, addr + size);
	vm_map_unlock(map);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = suser(td);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

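/*
 * Memory residency (mincore) system call.  One status byte is reported
 * per page of the given range.  The scan walks the map entries covering
 * the range, asking the pmap first (which can also tell whether this
 * process itself referenced or modified the page) and falling back to the
 * backing VM object for pages that are resident but not currently mapped.
 * Because subyte() may fault, the map lock is dropped around each store
 * and the whole scan is restarted whenever the map timestamp changes.
 */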
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			mtx_lock(&Giant);
			mincoreinfo = pmap_mincore(pmap, addr);
			mtx_unlock(&Giant);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;
				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				VM_OBJECT_LOCK(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather
				 * information about it.
				 */
				if (m) {
					mincoreinfo = MINCORE_INCORE;
					vm_page_lock_queues();
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
					vm_page_unlock_queues();
				}
				VM_OBJECT_UNLOCK(current->object.vm_object);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure
			 * that the byte vector is zeroed for those skipped
			 * entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the
			 * previous output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

	if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

#if 0
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) >
	    td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (ENOMEM);
#else
	error = suser(td);
	if (error)
		return (error);
#endif

	error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = 0;

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

#if 0
	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
	    td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (ENOMEM);
#else
	error = suser(td);
	if (error)
		return (error);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = suser(td);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

	error = suser(td);
	if (error)
		return (error);

	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	struct vnode *vp = NULL;
	objtype_t type;
	int rv = KERN_SUCCESS;
	vm_ooffset_t objsize;
	int docow;
	struct thread *td = curthread;

	if (size == 0)
		return (0);

	objsize = size = round_page(size);

	if (td->td_proc->p_vmspace->vm_map.size + size >
	    td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
		return (ENOMEM);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
		(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		type = OBJT_DEFAULT;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		vp = (struct vnode *) handle;
		mtx_lock(&Giant);
		ASSERT_VOP_LOCKED(vp, "vm_mmap");
		if (vp->v_type == VCHR) {
			type = OBJT_DEVICE;
			handle = vp->v_rdev;
		} else {
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat, td->td_ucred, td);
			if (error) {
				mtx_unlock(&Giant);
				return (error);
			}
			objsize = round_page(vat.va_size);
			type = OBJT_VNODE;
			/*
			 * if it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
		mtx_unlock(&Giant);
	}

	if (handle == NULL) {
		object = NULL;
		docow = 0;
	} else {
		object = vm_pager_allocate(type,
		    handle, objsize, prot, foff);
		if (object == NULL) {
			return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
		}
		docow = MAP_PREFAULT_PARTIAL;
	}

	/*
	 * Force device mappings to be shared.
	 */
	if (type == OBJT_DEVICE) {
		flags &= ~(MAP_PRIVATE|MAP_COPY);
		flags |= MAP_SHARED;
	}

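	/*
	 * Accumulate the copy-on-write flags handed to vm_map_find() /
	 * vm_map_stack(): a mapping that is neither anonymous nor shared
	 * is copy-on-write, MAP_NOSYNC becomes MAP_DISABLE_SYNCER and
	 * MAP_NOCORE becomes MAP_DISABLE_COREDUMP.
	 */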
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	if (fitit)
		*addr = pmap_addr_hint(object, *addr, size);

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else
		rv = vm_map_find(map, object, foff, addr, size, fitit,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	} else if (flags & MAP_SHARED) {
		/*
		 * Shared memory is also shared with children.
		 */
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS)
			(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * If the process has requested that all future mappings
	 * be wired, then heed this.
	 */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_wire(map, *addr, *addr + size,
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}