/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD$
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include "opt_bleed.h"
#include "opt_compat.h"
#include "opt_rlimit.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
        int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.
 * We want a default that is good enough to prevent the kernel from running
 * out of resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */

static void vmmapentry_rsrc_init __P((void *));
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
        void *dummy;
{
        max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
        max_proc_mmap /= 100;
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(p, uap)
        struct proc *p;
        struct sbrk_args *uap;
{
        /* Not yet implemented */
        /* mtx_lock(&Giant); */
        /* mtx_unlock(&Giant); */
        return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(p, uap)
        struct proc *p;
        struct sstk_args *uap;
{
        /* Not yet implemented */
        /* mtx_lock(&Giant); */
        /* mtx_unlock(&Giant); */
        return (EOPNOTSUPP);
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
        int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(p, uap)
        struct proc *p;
        struct getpagesize_args *uap;
{
        /* MP SAFE */
        p->p_retval[0] = PAGE_SIZE;
        return (0);
}
#endif				/* COMPAT_43 || COMPAT_SUNOS */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
        void *addr;
        size_t len;
        int prot;
        int flags;
        int fd;
        long pad;
        off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(p, uap)
        struct proc *p;
        struct mmap_args *uap;
{
        struct filedesc *fdp = p->p_fd;
        struct file *fp = NULL;
        struct vnode *vp;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot, maxprot;
        void *handle;
        int flags, error;
        int disablexworkaround;
        off_t pos;
        struct vmspace *vms = p->p_vmspace;
        vm_object_t obj;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
        flags = uap->flags;
        pos = uap->pos;

        /* make sure mapping fits into numeric range etc */
        if ((ssize_t) uap->len < 0 ||
            ((flags & MAP_ANON) && uap->fd != -1))
                return (EINVAL);

        if (flags & MAP_STACK) {
                if ((uap->fd != -1) ||
                    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;			/* low end... */
        size = (vm_size_t) round_page(size);	/* hi end */

        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);
                /* Address range must be all in user VM space. */
                if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
                        return (EINVAL);
#ifndef i386
                if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
                        return (EINVAL);
#endif
                if (addr + size < addr)
                        return (EINVAL);
        }
        /*
         * XXX for non-fixed mappings where no hint is provided or
         * the hint would fall in the potential heap space,
         * place it after the end of the largest possible heap.
         *
         * There should really be a pmap call to determine a reasonable
         * location.
         */
        else if (addr == 0 ||
            (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
             addr < round_page((vm_offset_t)vms->vm_daddr + MAXDSIZ)))
                addr = round_page((vm_offset_t)vms->vm_daddr + MAXDSIZ);

        mtx_lock(&Giant);	/* syscall marked mp-safe but isn't */
        if (flags & MAP_ANON) {
                /*
                 * Mapping blank space is trivial.
                 */
                handle = NULL;
                maxprot = VM_PROT_ALL;
                pos = 0;
        } else {
                /*
                 * Mapping file, get fp for validation.  Obtain vnode and make
                 * sure it is of appropriate type.
                 */
                if (((unsigned) uap->fd) >= fdp->fd_nfiles ||
                    (fp = fdp->fd_ofiles[uap->fd]) == NULL) {
                        error = EBADF;
                        goto done2;
                }
                if (fp->f_type != DTYPE_VNODE) {
                        error = EINVAL;
                        goto done2;
                }

                /*
                 * don't let the descriptor disappear on us if we block
                 */
                fhold(fp);

                /*
                 * POSIX shared-memory objects are defined to have
                 * kernel persistence, and are not defined to support
                 * read(2)/write(2) -- or even open(2).  Thus, we can
                 * use MAP_ASYNC to trade on-disk coherence for speed.
                 * The shm_open(3) library routine turns on the FPOSIXSHM
                 * flag to request this behavior.
                 */
                if (fp->f_flag & FPOSIXSHM)
                        flags |= MAP_NOSYNC;
                vp = (struct vnode *) fp->f_data;
                if (vp->v_type != VREG && vp->v_type != VCHR) {
                        error = EINVAL;
                        goto done;
                }
                if (vp->v_type == VREG) {
                        /*
                         * Get the proper underlying object
                         */
                        if (VOP_GETVOBJECT(vp, &obj) != 0) {
                                error = EINVAL;
                                goto done;
                        }
                        vp = (struct vnode*)obj->handle;
                }
                /*
                 * XXX hack to handle use of /dev/zero to map anon memory (ala
                 * SunOS).
                 */
                if ((vp->v_type == VCHR) &&
                    (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
                        handle = NULL;
                        maxprot = VM_PROT_ALL;
                        flags |= MAP_ANON;
                        pos = 0;
                } else {
                        /*
                         * cdevs do not provide private mappings of any kind.
                         */
                        /*
                         * However, for the XIG X server to continue to work,
                         * we should allow the superuser to do it anyway.
                         * We only allow it at securelevel < 1.
                         * (Because the XIG X server writes directly to video
                         * memory via /dev/mem, it should never work at any
                         * other securelevel.)
                         * XXX this will have to go
                         */
                        if (securelevel >= 1)
                                disablexworkaround = 1;
                        else
                                disablexworkaround = suser(p);
                        if (vp->v_type == VCHR && disablexworkaround &&
                            (flags & (MAP_PRIVATE|MAP_COPY))) {
                                error = EINVAL;
                                goto done;
                        }
                        /*
                         * Ensure that file and memory protections are
                         * compatible.  Note that we only worry about
                         * writability if the mapping is shared; in this case,
                         * current and max prot are dictated by the open file.
                         * XXX use the vnode instead?  Problem is: what
                         * credentials do we use for determination?  What if
                         * proc does a setuid?
                         */
                        maxprot = VM_PROT_EXECUTE;	/* ??? */
                        if (fp->f_flag & FREAD) {
                                maxprot |= VM_PROT_READ;
                        } else if (prot & PROT_READ) {
                                error = EACCES;
                                goto done;
                        }
                        /*
                         * If we are sharing potential changes (either via
                         * MAP_SHARED or via the implicit sharing of character
                         * device mappings), and we are trying to get write
                         * permission although we opened it without asking
                         * for it, bail out.  Check for superuser only if
                         * we're at securelevel < 1, to allow the XIG X server
                         * to continue to work.
                         */

                        if ((flags & MAP_SHARED) != 0 ||
                            (vp->v_type == VCHR && disablexworkaround)) {
                                if ((fp->f_flag & FWRITE) != 0) {
                                        struct vattr va;
                                        if ((error =
                                            VOP_GETATTR(vp, &va,
                                                p->p_ucred, p))) {
                                                goto done;
                                        }
                                        if ((va.va_flags &
                                            (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
                                                maxprot |= VM_PROT_WRITE;
                                        } else if (prot & PROT_WRITE) {
                                                error = EPERM;
                                                goto done;
                                        }
                                } else if ((prot & PROT_WRITE) != 0) {
                                        error = EACCES;
                                        goto done;
                                }
                        } else {
                                maxprot |= VM_PROT_WRITE;
                        }

                        handle = (void *)vp;
                }
        }

        /*
         * Do not allow more than a certain number of vm_map_entry structures
         * per process.  Scale with the number of rforks sharing the map
         * to make the limit reasonable for threads.
         */
        if (max_proc_mmap &&
            vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
                error = ENOMEM;
                goto done;
        }

        mtx_unlock(&Giant);
        error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
            flags, handle, pos);
        if (error == 0)
                p->p_retval[0] = (register_t) (addr + pageoff);
        mtx_lock(&Giant);
done:
        if (fp)
                fdrop(fp, p);
done2:
        mtx_unlock(&Giant);
        return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
        caddr_t addr;
        int len;
        int prot;
        int flags;
        int fd;
        long pos;
};
#endif
int
ommap(p, uap)
        struct proc *p;
        struct ommap_args *uap;
{
        struct mmap_args nargs;
        static const char cvtbsdprot[8] = {
                0,
                PROT_EXEC,
                PROT_WRITE,
                PROT_EXEC | PROT_WRITE,
                PROT_READ,
                PROT_EXEC | PROT_READ,
                PROT_WRITE | PROT_READ,
                PROT_EXEC | PROT_WRITE | PROT_READ,
        };

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

        nargs.addr = uap->addr;
        nargs.len = uap->len;
        nargs.prot = cvtbsdprot[uap->prot & 0x7];
        nargs.flags = 0;
        if (uap->flags & OMAP_ANON)
                nargs.flags |= MAP_ANON;
        if (uap->flags & OMAP_COPY)
                nargs.flags |= MAP_COPY;
        if (uap->flags & OMAP_SHARED)
                nargs.flags |= MAP_SHARED;
        else
                nargs.flags |= MAP_PRIVATE;
        if (uap->flags & OMAP_FIXED)
                nargs.flags |= MAP_FIXED;
        nargs.fd = uap->fd;
        nargs.pos = uap->pos;
        return (mmap(p, &nargs));
}
#endif				/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
        void *addr;
        int len;
        int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(p, uap)
        struct proc *p;
        struct msync_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int flags;
        vm_map_t map;
        int rv;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        flags = uap->flags;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
                return (EINVAL);

        mtx_lock(&Giant);

        map = &p->p_vmspace->vm_map;

        /*
         * XXX Gak!  If size is zero we are supposed to sync "all modified
         * pages with the region containing addr".  Unfortunately, we don't
         * really keep track of individual mmaps so we approximate by flushing
         * the range of the map entry containing addr.  This can be incorrect
         * if the region splits or is coalesced with a neighbor.
         */
        if (size == 0) {
                vm_map_entry_t entry;

                vm_map_lock_read(map);
                rv = vm_map_lookup_entry(map, addr, &entry);
                vm_map_unlock_read(map);
                if (rv == FALSE) {
                        rv = -1;
                        goto done2;
                }
                addr = entry->start;
                size = entry->end - entry->start;
        }

        /*
         * Clean the pages and interpret the return value.
         */
        rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
            (flags & MS_INVALIDATE) != 0);

done2:
        mtx_unlock(&Giant);

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
                return (EINVAL);	/* Sun returns ENOMEM? */
        case KERN_FAILURE:
                return (EIO);
        default:
                return (EINVAL);
        }
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
        void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(p, uap)
        struct proc *p;
        struct munmap_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if (size == 0)
                return (0);

        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
                return (EINVAL);
#ifndef i386
        if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
                return (EINVAL);
#endif
        mtx_lock(&Giant);
        map = &p->p_vmspace->vm_map;
        /*
         * Make sure entire range is allocated.
         */
        if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
                mtx_unlock(&Giant);
                return (EINVAL);
        }
        /* returns nothing but KERN_SUCCESS anyway */
        (void) vm_map_remove(map, addr, addr + size);
        mtx_unlock(&Giant);
        return (0);
}

#if 0
void
munmapfd(p, fd)
        struct proc *p;
        int fd;
{
        /*
         * XXX should unmap any regions mapped to this file
         */
        p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
}
#endif

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
        const void *addr;
        size_t len;
        int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(p, uap)
        struct proc *p;
        struct mprotect_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot;
        int ret;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;
#endif

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        mtx_lock(&Giant);
        ret = vm_map_protect(&p->p_vmspace->vm_map, addr,
            addr + size, prot, FALSE);
        mtx_unlock(&Giant);
        switch (ret) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
        void *addr;
        size_t len;
        int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(p, uap)
        struct proc *p;
        struct minherit_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_inherit_t inherit;
        int ret;

        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        inherit = uap->inherit;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        mtx_lock(&Giant);
        ret = vm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
            inherit);
        mtx_unlock(&Giant);

        switch (ret) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
        void *addr;
        size_t len;
        int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(p, uap)
        struct proc *p;
        struct madvise_args *uap;
{
        vm_offset_t start, end;
        int ret;

        /*
         * Check for illegal behavior
         */
        if (uap->behav < 0 || uap->behav > MADV_CORE)
                return (EINVAL);
        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (VM_MAXUSER_ADDRESS > 0 &&
            ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS)
                return (EINVAL);
#ifndef i386
        if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS)
                return (EINVAL);
#endif
        if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
                return (EINVAL);

        /*
         * Since this routine is only advisory, we default to conservative
         * behavior.
         */
        start = trunc_page((vm_offset_t) uap->addr);
        end = round_page((vm_offset_t) uap->addr + uap->len);

        mtx_lock(&Giant);
        ret = vm_map_madvise(&p->p_vmspace->vm_map, start, end, uap->behav);
        mtx_unlock(&Giant);
        return (ret ? EINVAL : 0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
        const void *addr;
        size_t len;
        char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(p, uap)
        struct proc *p;
        struct mincore_args *uap;
{
        vm_offset_t addr, first_addr;
        vm_offset_t end, cend;
        pmap_t pmap;
        vm_map_t map;
        char *vec;
        int error = 0;
        int vecindex, lastvecindex;
        vm_map_entry_t current;
        vm_map_entry_t entry;
        int mincoreinfo;
        unsigned int timestamp;

        /*
         * Make sure that the addresses presented are valid for user
         * mode.
         */
        first_addr = addr = trunc_page((vm_offset_t) uap->addr);
        end = addr + (vm_size_t)round_page(uap->len);
        if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS)
                return (EINVAL);
        if (end < addr)
                return (EINVAL);

        /*
         * Address of byte vector
         */
        vec = uap->vec;

        mtx_lock(&Giant);
        map = &p->p_vmspace->vm_map;
        pmap = vmspace_pmap(p->p_vmspace);

        vm_map_lock_read(map);
RestartScan:
        timestamp = map->timestamp;

        if (!vm_map_lookup_entry(map, addr, &entry))
                entry = entry->next;

        /*
         * Do this on a map entry basis so that if the pages are not
         * in the current process's address space, we can easily look
         * up the pages elsewhere.
         */
        lastvecindex = -1;
        for (current = entry;
            (current != &map->header) && (current->start < end);
            current = current->next) {

                /*
                 * ignore submaps (for now) or null objects
                 */
                if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
                    current->object.vm_object == NULL)
                        continue;

                /*
                 * limit this scan to the current map entry and the
                 * limits for the mincore call
                 */
                if (addr < current->start)
                        addr = current->start;
                cend = current->end;
                if (cend > end)
                        cend = end;

                /*
                 * scan this entry one page at a time
                 */
                while (addr < cend) {
                        /*
                         * Check pmap first, it is likely faster, also
                         * it can provide info as to whether we are the
                         * one referencing or modifying the page.
                         */
                        mincoreinfo = pmap_mincore(pmap, addr);
                        if (!mincoreinfo) {
                                vm_pindex_t pindex;
                                vm_ooffset_t offset;
                                vm_page_t m;
                                /*
                                 * calculate the page index into the object
                                 */
                                offset = current->offset + (addr - current->start);
                                pindex = OFF_TO_IDX(offset);
                                m = vm_page_lookup(current->object.vm_object,
                                    pindex);
                                /*
                                 * if the page is resident, then gather
                                 * information about it.
                                 */
                                if (m) {
                                        mincoreinfo = MINCORE_INCORE;
                                        if (m->dirty ||
                                            pmap_is_modified(m))
                                                mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                        if ((m->flags & PG_REFERENCED) ||
                                            pmap_ts_referenced(m)) {
                                                vm_page_flag_set(m, PG_REFERENCED);
                                                mincoreinfo |= MINCORE_REFERENCED_OTHER;
                                        }
                                }
                        }

                        /*
                         * subyte may page fault.  In case it needs to modify
                         * the map, we release the lock.
                         */
                        vm_map_unlock_read(map);

                        /*
                         * calculate index into user supplied byte vector
                         */
                        vecindex = OFF_TO_IDX(addr - first_addr);

                        /*
                         * If we have skipped map entries, we need to make sure
                         * that the byte vector is zeroed for those skipped
                         * entries.
                         */
                        while ((lastvecindex + 1) < vecindex) {
                                error = subyte(vec + lastvecindex, 0);
                                if (error) {
                                        error = EFAULT;
                                        goto done2;
                                }
                                ++lastvecindex;
                        }

                        /*
                         * Pass the page information to the user
                         */
                        error = subyte(vec + vecindex, mincoreinfo);
                        if (error) {
                                error = EFAULT;
                                goto done2;
                        }

                        /*
                         * If the map has changed, due to the subyte, the
                         * previous output may be invalid.
                         */
                        vm_map_lock_read(map);
                        if (timestamp != map->timestamp)
                                goto RestartScan;

                        lastvecindex = vecindex;
                        addr += PAGE_SIZE;
                }
        }

        /*
         * subyte may page fault.  In case it needs to modify
         * the map, we release the lock.
         */
        vm_map_unlock_read(map);

        /*
         * Zero the last entries in the byte vector.
         */
        vecindex = OFF_TO_IDX(end - first_addr);
        while ((lastvecindex + 1) < vecindex) {
                error = subyte(vec + lastvecindex, 0);
                if (error) {
                        error = EFAULT;
                        goto done2;
                }
                ++lastvecindex;
        }

        /*
         * If the map has changed, due to the subyte, the previous
         * output may be invalid.
         */
        vm_map_lock_read(map);
        if (timestamp != map->timestamp)
                goto RestartScan;
        vm_map_unlock_read(map);
done2:
        mtx_unlock(&Giant);
        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(p, uap)
        struct proc *p;
        struct mlock_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int error;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);

        /* disable wrap around */
        if (addr + size < addr)
                return (EINVAL);

        if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
                return (EAGAIN);

#ifdef pmap_wired_count
        if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
            p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
                return (ENOMEM);
#else
        error = suser(p);
        if (error)
                return (error);
#endif

        mtx_lock(&Giant);
        error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr,
            addr + size, FALSE);
        mtx_unlock(&Giant);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
        int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(p, uap)
        struct proc *p;
        struct mlockall_args *uap;
{
        /* mtx_lock(&Giant); */
        /* mtx_unlock(&Giant); */
        return 0;
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
        int how;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(p, uap)
        struct proc *p;
        struct munlockall_args *uap;
{
        /* mtx_lock(&Giant); */
        /* mtx_unlock(&Giant); */
        return 0;
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(p, uap)
        struct proc *p;
        struct munlock_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int error;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);

        /* disable wrap around */
        if (addr + size < addr)
                return (EINVAL);

#ifndef pmap_wired_count
        error = suser(p);
        if (error)
                return (error);
#endif

        mtx_lock(&Giant);
        error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr,
            addr + size, TRUE);
        mtx_unlock(&Giant);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    void *handle,
    vm_ooffset_t foff)
{
        boolean_t fitit;
        vm_object_t object;
        struct vnode *vp = NULL;
        objtype_t type;
        int rv = KERN_SUCCESS;
        vm_ooffset_t objsize;
        int docow;
        struct proc *p = curproc;

        if (size == 0)
                return (0);

        objsize = size = round_page(size);

        /*
         * We currently can only deal with page aligned file offsets.
         * The check is here rather than in the syscall because the
         * kernel calls this function internally for other mmapping
         * operations (such as in exec) and non-aligned offsets will
         * cause pmap inconsistencies...so we want to be sure to
         * disallow this in all cases.
         */
        if (foff & PAGE_MASK)
                return (EINVAL);

        if ((flags & MAP_FIXED) == 0) {
                fitit = TRUE;
                *addr = round_page(*addr);
                mtx_lock(&Giant);
        } else {
                if (*addr != trunc_page(*addr))
                        return (EINVAL);
                fitit = FALSE;
                mtx_lock(&Giant);
                (void) vm_map_remove(map, *addr, *addr + size);
        }

        /*
         * Lookup/allocate object.
         */
        if (flags & MAP_ANON) {
                type = OBJT_DEFAULT;
                /*
                 * Unnamed anonymous regions always start at 0.
                 */
                if (handle == 0)
                        foff = 0;
        } else {
                vp = (struct vnode *) handle;
                if (vp->v_type == VCHR) {
                        type = OBJT_DEVICE;
                        handle = (void *)(intptr_t)vp->v_rdev;
                } else {
                        struct vattr vat;
                        int error;

                        error = VOP_GETATTR(vp, &vat, p->p_ucred, p);
                        if (error) {
                                mtx_unlock(&Giant);
                                return (error);
                        }
                        objsize = round_page(vat.va_size);
                        type = OBJT_VNODE;
                        /*
                         * if it is a regular file without any references
                         * we do not need to sync it.
                         */
                        if (vp->v_type == VREG && vat.va_nlink == 0) {
                                flags |= MAP_NOSYNC;
                        }
                }
        }

        if (handle == NULL) {
                object = NULL;
                docow = 0;
        } else {
                object = vm_pager_allocate(type,
                    handle, objsize, prot, foff);
                if (object == NULL) {
                        mtx_unlock(&Giant);
                        return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
                }
                docow = MAP_PREFAULT_PARTIAL;
        }

        /*
         * Force device mappings to be shared.
         */
        if (type == OBJT_DEVICE || type == OBJT_PHYS) {
                flags &= ~(MAP_PRIVATE|MAP_COPY);
                flags |= MAP_SHARED;
        }

        if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
                docow |= MAP_COPY_ON_WRITE;
        if (flags & MAP_NOSYNC)
                docow |= MAP_DISABLE_SYNCER;
        if (flags & MAP_NOCORE)
                docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;

        if (maxprot & VM_PROT_READ)
                maxprot |= VM_PROT_EXECUTE;
#endif

        if (fitit)
                *addr = pmap_addr_hint(object, *addr, size);

        if (flags & MAP_STACK)
                rv = vm_map_stack(map, *addr, size, prot,
                    maxprot, docow);
        else
                rv = vm_map_find(map, object, foff, addr, size, fitit,
                    prot, maxprot, docow);

        if (rv != KERN_SUCCESS) {
                /*
                 * Lose the object reference.  Will destroy the
                 * object if it's an unnamed anonymous mapping
                 * or named anonymous without other references.
                 */
                vm_object_deallocate(object);
        } else if (flags & MAP_SHARED) {
                /*
                 * Shared memory is also shared with children.
                 */
                rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
                if (rv != KERN_SUCCESS)
                        (void) vm_map_remove(map, *addr, *addr + size);
        }
        mtx_unlock(&Giant);
        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
        case KERN_NO_SPACE:
                return (ENOMEM);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        default:
                return (EINVAL);
        }
}
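
/*
 * Illustrative sketch, kept out of the build: a worked example of the
 * pageoff/round_page arithmetic shared by msync(), munmap(), mprotect(),
 * minherit(), mlock() and munlock() above.  The function name and the
 * constants are hypothetical and assume a 4K page (PAGE_SIZE = 0x1000,
 * PAGE_MASK = 0xfff); mmap() applies the same arithmetic to the file
 * offset (pos) and, per its leading comment, returns trunc_page(addr)
 * adjusted up by the page offset.
 */
#if 0
static void
example_page_rounding(void)
{
        vm_offset_t addr = 0x12345;	/* non-page-aligned user address */
        vm_size_t size = 0x100;		/* requested length */
        vm_size_t pageoff;

        pageoff = (addr & PAGE_MASK);		/* 0x345 */
        addr -= pageoff;			/* trunc_page: 0x12000 */
        size += pageoff;			/* cover leading fragment: 0x445 */
        size = (vm_size_t) round_page(size);	/* whole pages: 0x1000 */

        /*
         * The operation therefore spans [0x12000, 0x13000), i.e. the single
         * page containing the caller's original [0x12345, 0x12445) range.
         */
}
#endif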