/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD$
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include "opt_bleed.h"
#include "opt_compat.h"
#include "opt_rlimit.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
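
/*
 * Note (added for clarity): the limit is computed at boot by
 * vmmapentry_rsrc_init() below; since the sysctl above is CTLFLAG_RW,
 * the administrator may also override it at run time via the
 * vm.max_proc_mmap sysctl.
 */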

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel from running out of
 * resources if attacked from a compromised user account, but generous
 * enough such that multi-threaded processes are not unduly inconvenienced.
 */

static void vmmapentry_rsrc_init __P((void *));
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
	void *dummy;
{
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

/* ARGSUSED */
int
sbrk(p, uap)
	struct proc *p;
	struct sbrk_args *uap;
{

	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/* ARGSUSED */
int
sstk(p, uap)
	struct proc *p;
	struct sstk_args *uap;
{

	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(p, uap)
	struct proc *p;
	struct getpagesize_args *uap;
{

	p->p_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
mmap(p, uap)
	struct proc *p;
	register struct mmap_args *uap;
{
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	int disablexworkaround;
	off_t pos;
	struct vmspace *vms = p->p_vmspace;
	vm_object_t obj;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	/* make sure mapping fits into numeric range etc */
	if ((ssize_t) uap->len < 0 ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */
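
	/*
	 * For example (illustrative values only), with a 4K page size a
	 * request of len = 0x2100 at file offset pos = 0x1345 yields
	 * pageoff = 0x345; pos is backed up to 0x1000 and size grows to
	 * 0x2445, which round_page() then rounds up to 0x3000, i.e. three
	 * pages covering the caller's original byte range.
	 */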

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
			return (EINVAL);
#ifndef i386
		if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
			return (EINVAL);
#endif
		if (addr + size < addr)
			return (EINVAL);
	}
	/*
	 * XXX for non-fixed mappings where no hint is provided or
	 * the hint would fall in the potential heap space,
	 * place it after the end of the largest possible heap.
	 *
	 * There should really be a pmap call to determine a reasonable
	 * location.
	 */
	else if (addr == 0 ||
	    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
	     addr < round_page((vm_offset_t)vms->vm_daddr + MAXDSIZ)))
		addr = round_page((vm_offset_t)vms->vm_daddr + MAXDSIZ);

	mtx_lock(&Giant);
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		if (((unsigned) uap->fd) >= fdp->fd_nfiles ||
		    (fp = fdp->fd_ofiles[uap->fd]) == NULL) {
			mtx_unlock(&Giant);
			return (EBADF);
		}
		if (fp->f_type != DTYPE_VNODE) {
			mtx_unlock(&Giant);
			return (EINVAL);
		}

		/*
		 * don't let the descriptor disappear on us if we block
		 */
		fhold(fp);

		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_NOSYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = (struct vnode *) fp->f_data;
		if (vp->v_type != VREG && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
		if (vp->v_type == VREG) {
			/*
			 * Get the proper underlying object
			 */
			if (VOP_GETVOBJECT(vp, &obj) != 0) {
				error = EINVAL;
				goto done;
			}
			vp = (struct vnode *)obj->handle;
		}
		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if ((vp->v_type == VCHR) &&
		    (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
		} else {
			/*
			 * cdevs do not provide private mappings of any kind.
			 */
			/*
			 * However, for XIG X server to continue to work,
			 * we should allow the superuser to do it anyway.
			 * We only allow it at securelevel < 1.
			 * (Because the XIG X server writes directly to video
			 * memory via /dev/mem, it should never work at any
			 * other securelevel.)
			 * XXX this will have to go
			 */
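			/*
			 * disablexworkaround ends up non-zero when the
			 * workaround must not be applied, i.e. when we are
			 * at securelevel >= 1 or the caller is not the
			 * superuser; in that case private mappings of a
			 * character device are rejected below.
			 */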
			if (securelevel >= 1)
				disablexworkaround = 1;
			else
				disablexworkaround = suser(p);
			if (vp->v_type == VCHR && disablexworkaround &&
			    (flags & (MAP_PRIVATE|MAP_COPY))) {
				error = EINVAL;
				goto done;
			}
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination?  What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;	/* ??? */
			if (fp->f_flag & FREAD) {
				maxprot |= VM_PROT_READ;
			} else if (prot & PROT_READ) {
				error = EACCES;
				goto done;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.  Check for superuser, only if
			 * we're at securelevel < 1, to allow the XIG X server
			 * to continue to work.
			 */

			if ((flags & MAP_SHARED) != 0 ||
			    (vp->v_type == VCHR && disablexworkaround)) {
				if ((fp->f_flag & FWRITE) != 0) {
					struct vattr va;
					if ((error =
					    VOP_GETATTR(vp, &va,
						p->p_ucred, p))) {
						goto done;
					}
					if ((va.va_flags &
					    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
						maxprot |= VM_PROT_WRITE;
					} else if (prot & PROT_WRITE) {
						error = EPERM;
						goto done;
					}
				} else if ((prot & PROT_WRITE) != 0) {
					error = EACCES;
					goto done;
				}
			} else {
				maxprot |= VM_PROT_WRITE;
			}

			handle = (void *)vp;
		}
	}

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
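	/*
	 * Purely as an illustration: with, say, 100 MB of kernel malloc
	 * space and vm_map_entry structures of roughly 100 bytes, the
	 * computation in vmmapentry_rsrc_init() yields a max_proc_mmap on
	 * the order of 10000, so mmap() starts failing with ENOMEM once the
	 * map holds that many entries per reference on the vmspace.  (The
	 * figures are illustrative only; the real values depend on
	 * vm_kmem_size and sizeof(struct vm_map_entry).)
	 */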
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	mtx_unlock(&Giant);
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos);
	if (error == 0)
		p->p_retval[0] = (register_t) (addr + pageoff);
	mtx_lock(&Giant);
done:
	if (fp)
		fdrop(fp, p);
	mtx_unlock(&Giant);
	return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(p, uap)
	struct proc *p;
	register struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100
#define	OMAP_INHERIT	0x0800

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	if (uap->flags & OMAP_INHERIT)
		nargs.flags |= MAP_INHERIT;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(p, &nargs));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	int len;
	int flags;
};
#endif
int
msync(p, uap)
	struct proc *p;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages within the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
#ifndef BLEED
	mtx_lock(&Giant);
#endif
	mtx_lock(&vm_mtx);
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		vm_map_unlock_read(map);
		if (rv == FALSE) {
			mtx_unlock(&vm_mtx);
#ifndef BLEED
			mtx_unlock(&Giant);
#endif
			return (EINVAL);
		}
		addr = entry->start;
		size = entry->end - entry->start;
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);

	mtx_unlock(&vm_mtx);
#ifndef BLEED
	mtx_unlock(&Giant);
#endif
	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}

	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
munmap(p, uap)
	register struct proc *p;
	register struct munmap_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
		return (EINVAL);
#ifndef i386
	if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
		return (EINVAL);
#endif
	map = &p->p_vmspace->vm_map;
	/*
	 * Make sure entire range is allocated.
	 */
	mtx_lock(&Giant);
	mtx_lock(&vm_mtx);
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
		mtx_unlock(&vm_mtx);
		mtx_unlock(&Giant);
		return (EINVAL);
	}
	/* returns nothing but KERN_SUCCESS anyway */
	(void) vm_map_remove(map, addr, addr + size);
	mtx_unlock(&vm_mtx);
	mtx_unlock(&Giant);
	return (0);
}

#if 0
void
munmapfd(p, fd)
	struct proc *p;
	int fd;
{
	/*
	 * XXX should unmap any regions mapped to this file
	 */
	p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
}
#endif

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
mprotect(p, uap)
	struct proc *p;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	register vm_prot_t prot;
	int ret;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	mtx_lock(&Giant);
	mtx_lock(&vm_mtx);
	ret = vm_map_protect(&p->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE);
	mtx_unlock(&vm_mtx);
	mtx_unlock(&Giant);
	switch (ret) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
minherit(p, uap)
	struct proc *p;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	register vm_inherit_t inherit;
	int ret;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

#ifndef BLEED
	mtx_lock(&Giant);
#endif
	mtx_lock(&vm_mtx);
	ret = vm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
	    inherit);
	mtx_unlock(&vm_mtx);
#ifndef BLEED
	mtx_unlock(&Giant);
#endif

	switch (ret) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/* ARGSUSED */
int
madvise(p, uap)
	struct proc *p;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	int ret;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap...  Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAXUSER_ADDRESS > 0 &&
	    ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS)
		return (EINVAL);
#ifndef i386
	if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS)
		return (EINVAL);
#endif
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

#ifndef BLEED
	mtx_lock(&Giant);
#endif
	mtx_lock(&vm_mtx);
	ret = vm_map_madvise(&p->p_vmspace->vm_map, start, end, uap->behav);
	mtx_unlock(&vm_mtx);
#ifndef BLEED
	mtx_unlock(&Giant);
#endif
	return (ret ? EINVAL : 0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/* ARGSUSED */
int
mincore(p, uap)
	struct proc *p;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	register vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS)
		return (EINVAL);
	if (end < addr)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
#ifndef BLEED
	mtx_lock(&Giant);
#endif
	mtx_lock(&vm_mtx);
	pmap = vmspace_pmap(p->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
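	/*
	 * Each byte stored into the user's vector below is either 0 (page
	 * not resident) or a combination of the MINCORE_* flags:
	 * MINCORE_INCORE plus modified/referenced bits reported by
	 * pmap_mincore() or derived from the vm_page itself.
	 */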
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;
				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather
				 * information about it.
				 */
				if (m) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);
			mtx_unlock(&vm_mtx);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure
			 * that the byte vector is zeroed for those skipped
			 * entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
#ifndef BLEED
					mtx_unlock(&Giant);
#endif
					return (EFAULT);
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
#ifndef BLEED
				mtx_unlock(&Giant);
#endif
				return (EFAULT);
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			mtx_lock(&vm_mtx);
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);
	mtx_unlock(&vm_mtx);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
#ifndef BLEED
			mtx_unlock(&Giant);
#endif
			return (EFAULT);
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
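	/*
	 * map->timestamp is bumped whenever the map is modified, so if it
	 * no longer matches the value sampled at RestartScan the results
	 * gathered while the locks were dropped may be stale and the scan
	 * is restarted.
	 */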
	mtx_lock(&vm_mtx);
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
	mtx_unlock(&vm_mtx);
#ifndef BLEED
	mtx_unlock(&Giant);
#endif

	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
mlock(p, uap)
	struct proc *p;
	struct mlock_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

	if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (ENOMEM);
#else
	error = suser(p);
	if (error)
		return (error);
#endif

#ifndef BLEED
	mtx_lock(&Giant);
#endif
	mtx_lock(&vm_mtx);
	error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr,
	    addr + size, FALSE);
	mtx_unlock(&vm_mtx);
#ifndef BLEED
	mtx_unlock(&Giant);
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
mlockall(p, uap)
	struct proc *p;
	struct mlockall_args *uap;
{
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	int how;
};
#endif

int
munlockall(p, uap)
	struct proc *p;
	struct munlockall_args *uap;
{
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
munlock(p, uap)
	struct proc *p;
	struct munlock_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

#ifndef pmap_wired_count
	error = suser(p);
	if (error)
		return (error);
#endif

#ifndef BLEED
	mtx_lock(&Giant);
#endif
	mtx_lock(&vm_mtx);
	error = vm_map_user_pageable(&p->p_vmspace->vm_map, addr,
	    addr + size, TRUE);
	mtx_unlock(&vm_mtx);
#ifndef BLEED
	mtx_unlock(&Giant);
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 */
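
/*
 * As a rough illustration of the interface (not an excerpt from any real
 * caller): an in-kernel user that wants a pageable, copy-on-write, read-only
 * view of the first 64K of a vnode it already holds could do something like
 *
 *	vm_offset_t addr = 0;
 *	int error;
 *
 *	error = vm_mmap(&vmspace->vm_map, &addr, 65536,
 *	    VM_PROT_READ, VM_PROT_ALL, MAP_PRIVATE,
 *	    (void *)vp, 0);
 *
 * where vmspace and vp are hypothetical locals; on success addr holds the
 * address chosen for the mapping.
 */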
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    void *handle,
    vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	struct vnode *vp = NULL;
	objtype_t type;
	int rv = KERN_SUCCESS;
	vm_ooffset_t objsize;
	int docow;
	struct proc *p = curproc;

	if (size == 0)
		return (0);

	objsize = size = round_page(size);

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
		mtx_lock(&Giant);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
		mtx_lock(&Giant);
		mtx_lock(&vm_mtx);
		(void) vm_map_remove(map, *addr, *addr + size);
		mtx_unlock(&vm_mtx);
	}

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		type = OBJT_DEFAULT;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		vp = (struct vnode *) handle;
		if (vp->v_type == VCHR) {
			type = OBJT_DEVICE;
			handle = (void *)(intptr_t)vp->v_rdev;
		} else {
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat, p->p_ucred, p);
			if (error) {
				mtx_unlock(&Giant);
				return (error);
			}
			objsize = round_page(vat.va_size);
			type = OBJT_VNODE;
			/*
			 * if it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	if (handle == NULL) {
		object = NULL;
		docow = 0;
	} else {
		object = vm_pager_allocate(type,
		    handle, objsize, prot, foff);
		if (object == NULL) {
			mtx_unlock(&Giant);
			return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
		}
		docow = MAP_PREFAULT_PARTIAL;
	}

	/*
	 * Force device mappings to be shared.
	 */
	if (type == OBJT_DEVICE || type == OBJT_PHYS) {
		flags &= ~(MAP_PRIVATE|MAP_COPY);
		flags |= MAP_SHARED;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	mtx_lock(&vm_mtx);
	if (fitit)
		*addr = pmap_addr_hint(object, *addr, size);

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot,
		    maxprot, docow);
	else
		rv = vm_map_find(map, object, foff, addr, size, fitit,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS)
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);

	/*
	 * Shared memory is also shared with children.
	 */
	else if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS)
			(void) vm_map_remove(map, *addr, *addr + size);
	}
	mtx_unlock(&vm_mtx);
	mtx_unlock(&Giant);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
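
/*
 * Illustrative only (not part of the kernel proper): a typical userland
 * sequence that exercises the system calls above might look like
 *
 *	int fd = open("/some/file", O_RDWR);		// hypothetical path
 *	size_t len = 3 * getpagesize();
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	// ... modify the mapping ...
 *	msync(p, len, MS_SYNC);
 *	munmap(p, len);
 *	close(fd);
 *
 * with error checking omitted for brevity.
 */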