1 /*- 2 * Copyright (c) 1988 University of Utah. 3 * Copyright (c) 1991, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the Systems Programming Group of the University of Utah Computer 8 * Science Department. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 35 * 36 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 37 */ 38 39 /* 40 * Mapped file (mmap) interface to VM 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_compat.h" 47 #include "opt_hwpmc_hooks.h" 48 #include "opt_mac.h" 49 50 #include <sys/param.h> 51 #include <sys/systm.h> 52 #include <sys/kernel.h> 53 #include <sys/lock.h> 54 #include <sys/mutex.h> 55 #include <sys/sysproto.h> 56 #include <sys/filedesc.h> 57 #include <sys/priv.h> 58 #include <sys/proc.h> 59 #include <sys/resource.h> 60 #include <sys/resourcevar.h> 61 #include <sys/vnode.h> 62 #include <sys/fcntl.h> 63 #include <sys/file.h> 64 #include <sys/mman.h> 65 #include <sys/mount.h> 66 #include <sys/conf.h> 67 #include <sys/stat.h> 68 #include <sys/vmmeter.h> 69 #include <sys/sysctl.h> 70 71 #include <security/mac/mac_framework.h> 72 73 #include <vm/vm.h> 74 #include <vm/vm_param.h> 75 #include <vm/pmap.h> 76 #include <vm/vm_map.h> 77 #include <vm/vm_object.h> 78 #include <vm/vm_page.h> 79 #include <vm/vm_pager.h> 80 #include <vm/vm_pageout.h> 81 #include <vm/vm_extern.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_kern.h> 84 85 #ifdef HWPMC_HOOKS 86 #include <sys/pmckern.h> 87 #endif 88 89 #ifndef _SYS_SYSPROTO_H_ 90 struct sbrk_args { 91 int incr; 92 }; 93 #endif 94 95 static int max_proc_mmap; 96 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, ""); 97 98 /* 99 * Set the maximum number of vm_map_entry structures per process. Roughly 100 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 101 * of our KVM malloc space still results in generous limits. We want a 102 * default that is good enough to prevent the kernel running out of resources 103 * if attacked from compromised user account but generous enough such that 104 * multi-threaded processes are not unduly inconvenienced. 105 */ 106 static void vmmapentry_rsrc_init(void *); 107 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL) 108 109 static void 110 vmmapentry_rsrc_init(dummy) 111 void *dummy; 112 { 113 max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); 114 max_proc_mmap /= 100; 115 } 116 117 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 118 int *, struct vnode *, vm_ooffset_t, vm_object_t *); 119 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 120 int *, struct cdev *, vm_ooffset_t, vm_object_t *); 121 122 /* 123 * MPSAFE 124 */ 125 /* ARGSUSED */ 126 int 127 sbrk(td, uap) 128 struct thread *td; 129 struct sbrk_args *uap; 130 { 131 /* Not yet implemented */ 132 return (EOPNOTSUPP); 133 } 134 135 #ifndef _SYS_SYSPROTO_H_ 136 struct sstk_args { 137 int incr; 138 }; 139 #endif 140 141 /* 142 * MPSAFE 143 */ 144 /* ARGSUSED */ 145 int 146 sstk(td, uap) 147 struct thread *td; 148 struct sstk_args *uap; 149 { 150 /* Not yet implemented */ 151 return (EOPNOTSUPP); 152 } 153 154 #if defined(COMPAT_43) 155 #ifndef _SYS_SYSPROTO_H_ 156 struct getpagesize_args { 157 int dummy; 158 }; 159 #endif 160 161 /* ARGSUSED */ 162 int 163 ogetpagesize(td, uap) 164 struct thread *td; 165 struct getpagesize_args *uap; 166 { 167 /* MP SAFE */ 168 td->td_retval[0] = PAGE_SIZE; 169 return (0); 170 } 171 #endif /* COMPAT_43 */ 172 173 174 /* 175 * Memory Map (mmap) system call. Note that the file offset 176 * and address are allowed to be NOT page aligned, though if 177 * the MAP_FIXED flag it set, both must have the same remainder 178 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 179 * page-aligned, the actual mapping starts at trunc_page(addr) 180 * and the return value is adjusted up by the page offset. 181 * 182 * Generally speaking, only character devices which are themselves 183 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 184 * there would be no cache coherency between a descriptor and a VM mapping 185 * both to the same character device. 186 * 187 * Block devices can be mmap'd no matter what they represent. Cache coherency 188 * is maintained as long as you do not write directly to the underlying 189 * character device. 190 */ 191 #ifndef _SYS_SYSPROTO_H_ 192 struct mmap_args { 193 void *addr; 194 size_t len; 195 int prot; 196 int flags; 197 int fd; 198 long pad; 199 off_t pos; 200 }; 201 #endif 202 203 /* 204 * MPSAFE 205 */ 206 int 207 mmap(td, uap) 208 struct thread *td; 209 struct mmap_args *uap; 210 { 211 #ifdef HWPMC_HOOKS 212 struct pmckern_map_in pkm; 213 #endif 214 struct file *fp; 215 struct vnode *vp; 216 vm_offset_t addr; 217 vm_size_t size, pageoff; 218 vm_prot_t prot, maxprot; 219 void *handle; 220 objtype_t handle_type; 221 int flags, error; 222 off_t pos; 223 struct vmspace *vms = td->td_proc->p_vmspace; 224 225 addr = (vm_offset_t) uap->addr; 226 size = uap->len; 227 prot = uap->prot & VM_PROT_ALL; 228 flags = uap->flags; 229 pos = uap->pos; 230 231 fp = NULL; 232 /* make sure mapping fits into numeric range etc */ 233 if ((ssize_t) uap->len < 0 || 234 ((flags & MAP_ANON) && uap->fd != -1)) 235 return (EINVAL); 236 237 if (flags & MAP_STACK) { 238 if ((uap->fd != -1) || 239 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 240 return (EINVAL); 241 flags |= MAP_ANON; 242 pos = 0; 243 } 244 245 /* 246 * Align the file position to a page boundary, 247 * and save its page offset component. 248 */ 249 pageoff = (pos & PAGE_MASK); 250 pos -= pageoff; 251 252 /* Adjust size for rounding (on both ends). */ 253 size += pageoff; /* low end... */ 254 size = (vm_size_t) round_page(size); /* hi end */ 255 256 /* 257 * Check for illegal addresses. Watch out for address wrap... Note 258 * that VM_*_ADDRESS are not constants due to casts (argh). 259 */ 260 if (flags & MAP_FIXED) { 261 /* 262 * The specified address must have the same remainder 263 * as the file offset taken modulo PAGE_SIZE, so it 264 * should be aligned after adjustment by pageoff. 265 */ 266 addr -= pageoff; 267 if (addr & PAGE_MASK) 268 return (EINVAL); 269 /* Address range must be all in user VM space. */ 270 if (addr < vm_map_min(&vms->vm_map) || 271 addr + size > vm_map_max(&vms->vm_map)) 272 return (EINVAL); 273 if (addr + size < addr) 274 return (EINVAL); 275 } else { 276 /* 277 * XXX for non-fixed mappings where no hint is provided or 278 * the hint would fall in the potential heap space, 279 * place it after the end of the largest possible heap. 280 * 281 * There should really be a pmap call to determine a reasonable 282 * location. 283 */ 284 PROC_LOCK(td->td_proc); 285 if (addr == 0 || 286 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 287 addr < round_page((vm_offset_t)vms->vm_daddr + 288 lim_max(td->td_proc, RLIMIT_DATA)))) 289 addr = round_page((vm_offset_t)vms->vm_daddr + 290 lim_max(td->td_proc, RLIMIT_DATA)); 291 PROC_UNLOCK(td->td_proc); 292 } 293 if (flags & MAP_ANON) { 294 /* 295 * Mapping blank space is trivial. 296 */ 297 handle = NULL; 298 handle_type = OBJT_DEFAULT; 299 maxprot = VM_PROT_ALL; 300 pos = 0; 301 } else { 302 /* 303 * Mapping file, get fp for validation. Obtain vnode and make 304 * sure it is of appropriate type. 305 * don't let the descriptor disappear on us if we block 306 */ 307 if ((error = fget(td, uap->fd, &fp)) != 0) 308 goto done; 309 if (fp->f_type != DTYPE_VNODE) { 310 error = ENODEV; 311 goto done; 312 } 313 /* 314 * POSIX shared-memory objects are defined to have 315 * kernel persistence, and are not defined to support 316 * read(2)/write(2) -- or even open(2). Thus, we can 317 * use MAP_ASYNC to trade on-disk coherence for speed. 318 * The shm_open(3) library routine turns on the FPOSIXSHM 319 * flag to request this behavior. 320 */ 321 if (fp->f_flag & FPOSIXSHM) 322 flags |= MAP_NOSYNC; 323 vp = fp->f_vnode; 324 /* 325 * Ensure that file and memory protections are 326 * compatible. Note that we only worry about 327 * writability if mapping is shared; in this case, 328 * current and max prot are dictated by the open file. 329 * XXX use the vnode instead? Problem is: what 330 * credentials do we use for determination? What if 331 * proc does a setuid? 332 */ 333 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) 334 maxprot = VM_PROT_NONE; 335 else 336 maxprot = VM_PROT_EXECUTE; 337 if (fp->f_flag & FREAD) { 338 maxprot |= VM_PROT_READ; 339 } else if (prot & PROT_READ) { 340 error = EACCES; 341 goto done; 342 } 343 /* 344 * If we are sharing potential changes (either via 345 * MAP_SHARED or via the implicit sharing of character 346 * device mappings), and we are trying to get write 347 * permission although we opened it without asking 348 * for it, bail out. 349 */ 350 if ((flags & MAP_SHARED) != 0) { 351 if ((fp->f_flag & FWRITE) != 0) { 352 maxprot |= VM_PROT_WRITE; 353 } else if ((prot & PROT_WRITE) != 0) { 354 error = EACCES; 355 goto done; 356 } 357 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { 358 maxprot |= VM_PROT_WRITE; 359 } 360 handle = (void *)vp; 361 handle_type = OBJT_VNODE; 362 } 363 364 /* 365 * Do not allow more then a certain number of vm_map_entry structures 366 * per process. Scale with the number of rforks sharing the map 367 * to make the limit reasonable for threads. 368 */ 369 if (max_proc_mmap && 370 vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { 371 error = ENOMEM; 372 goto done; 373 } 374 375 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 376 flags, handle_type, handle, pos); 377 #ifdef HWPMC_HOOKS 378 /* inform hwpmc(4) if an executable is being mapped */ 379 if (error == 0 && handle_type == OBJT_VNODE && 380 (prot & PROT_EXEC)) { 381 pkm.pm_file = handle; 382 pkm.pm_address = (uintptr_t) addr; 383 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); 384 } 385 #endif 386 if (error == 0) 387 td->td_retval[0] = (register_t) (addr + pageoff); 388 done: 389 if (fp) 390 fdrop(fp, td); 391 392 return (error); 393 } 394 395 int 396 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 397 { 398 struct mmap_args oargs; 399 400 oargs.addr = uap->addr; 401 oargs.len = uap->len; 402 oargs.prot = uap->prot; 403 oargs.flags = uap->flags; 404 oargs.fd = uap->fd; 405 oargs.pos = uap->pos; 406 return (mmap(td, &oargs)); 407 } 408 409 #ifdef COMPAT_43 410 #ifndef _SYS_SYSPROTO_H_ 411 struct ommap_args { 412 caddr_t addr; 413 int len; 414 int prot; 415 int flags; 416 int fd; 417 long pos; 418 }; 419 #endif 420 int 421 ommap(td, uap) 422 struct thread *td; 423 struct ommap_args *uap; 424 { 425 struct mmap_args nargs; 426 static const char cvtbsdprot[8] = { 427 0, 428 PROT_EXEC, 429 PROT_WRITE, 430 PROT_EXEC | PROT_WRITE, 431 PROT_READ, 432 PROT_EXEC | PROT_READ, 433 PROT_WRITE | PROT_READ, 434 PROT_EXEC | PROT_WRITE | PROT_READ, 435 }; 436 437 #define OMAP_ANON 0x0002 438 #define OMAP_COPY 0x0020 439 #define OMAP_SHARED 0x0010 440 #define OMAP_FIXED 0x0100 441 442 nargs.addr = uap->addr; 443 nargs.len = uap->len; 444 nargs.prot = cvtbsdprot[uap->prot & 0x7]; 445 nargs.flags = 0; 446 if (uap->flags & OMAP_ANON) 447 nargs.flags |= MAP_ANON; 448 if (uap->flags & OMAP_COPY) 449 nargs.flags |= MAP_COPY; 450 if (uap->flags & OMAP_SHARED) 451 nargs.flags |= MAP_SHARED; 452 else 453 nargs.flags |= MAP_PRIVATE; 454 if (uap->flags & OMAP_FIXED) 455 nargs.flags |= MAP_FIXED; 456 nargs.fd = uap->fd; 457 nargs.pos = uap->pos; 458 return (mmap(td, &nargs)); 459 } 460 #endif /* COMPAT_43 */ 461 462 463 #ifndef _SYS_SYSPROTO_H_ 464 struct msync_args { 465 void *addr; 466 int len; 467 int flags; 468 }; 469 #endif 470 /* 471 * MPSAFE 472 */ 473 int 474 msync(td, uap) 475 struct thread *td; 476 struct msync_args *uap; 477 { 478 vm_offset_t addr; 479 vm_size_t size, pageoff; 480 int flags; 481 vm_map_t map; 482 int rv; 483 484 addr = (vm_offset_t) uap->addr; 485 size = uap->len; 486 flags = uap->flags; 487 488 pageoff = (addr & PAGE_MASK); 489 addr -= pageoff; 490 size += pageoff; 491 size = (vm_size_t) round_page(size); 492 if (addr + size < addr) 493 return (EINVAL); 494 495 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 496 return (EINVAL); 497 498 map = &td->td_proc->p_vmspace->vm_map; 499 500 /* 501 * Clean the pages and interpret the return value. 502 */ 503 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 504 (flags & MS_INVALIDATE) != 0); 505 switch (rv) { 506 case KERN_SUCCESS: 507 return (0); 508 case KERN_INVALID_ADDRESS: 509 return (EINVAL); /* Sun returns ENOMEM? */ 510 case KERN_INVALID_ARGUMENT: 511 return (EBUSY); 512 default: 513 return (EINVAL); 514 } 515 } 516 517 #ifndef _SYS_SYSPROTO_H_ 518 struct munmap_args { 519 void *addr; 520 size_t len; 521 }; 522 #endif 523 /* 524 * MPSAFE 525 */ 526 int 527 munmap(td, uap) 528 struct thread *td; 529 struct munmap_args *uap; 530 { 531 #ifdef HWPMC_HOOKS 532 struct pmckern_map_out pkm; 533 vm_map_entry_t entry; 534 #endif 535 vm_offset_t addr; 536 vm_size_t size, pageoff; 537 vm_map_t map; 538 539 addr = (vm_offset_t) uap->addr; 540 size = uap->len; 541 if (size == 0) 542 return (EINVAL); 543 544 pageoff = (addr & PAGE_MASK); 545 addr -= pageoff; 546 size += pageoff; 547 size = (vm_size_t) round_page(size); 548 if (addr + size < addr) 549 return (EINVAL); 550 551 /* 552 * Check for illegal addresses. Watch out for address wrap... 553 */ 554 map = &td->td_proc->p_vmspace->vm_map; 555 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 556 return (EINVAL); 557 vm_map_lock(map); 558 /* 559 * Make sure entire range is allocated. 560 */ 561 if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) { 562 vm_map_unlock(map); 563 return (EINVAL); 564 } 565 #ifdef HWPMC_HOOKS 566 /* 567 * Inform hwpmc if the address range being unmapped contains 568 * an executable region. 569 */ 570 if (vm_map_lookup_entry(map, addr, &entry)) { 571 for (; 572 entry != &map->header && entry->start < addr + size; 573 entry = entry->next) { 574 if (vm_map_check_protection(map, entry->start, 575 entry->end, VM_PROT_EXECUTE) == TRUE) { 576 pkm.pm_address = (uintptr_t) addr; 577 pkm.pm_size = (size_t) size; 578 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, 579 (void *) &pkm); 580 break; 581 } 582 } 583 } 584 #endif 585 /* returns nothing but KERN_SUCCESS anyway */ 586 vm_map_delete(map, addr, addr + size); 587 vm_map_unlock(map); 588 return (0); 589 } 590 591 #ifndef _SYS_SYSPROTO_H_ 592 struct mprotect_args { 593 const void *addr; 594 size_t len; 595 int prot; 596 }; 597 #endif 598 /* 599 * MPSAFE 600 */ 601 int 602 mprotect(td, uap) 603 struct thread *td; 604 struct mprotect_args *uap; 605 { 606 vm_offset_t addr; 607 vm_size_t size, pageoff; 608 vm_prot_t prot; 609 610 addr = (vm_offset_t) uap->addr; 611 size = uap->len; 612 prot = uap->prot & VM_PROT_ALL; 613 #if defined(VM_PROT_READ_IS_EXEC) 614 if (prot & VM_PROT_READ) 615 prot |= VM_PROT_EXECUTE; 616 #endif 617 618 pageoff = (addr & PAGE_MASK); 619 addr -= pageoff; 620 size += pageoff; 621 size = (vm_size_t) round_page(size); 622 if (addr + size < addr) 623 return (EINVAL); 624 625 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 626 addr + size, prot, FALSE)) { 627 case KERN_SUCCESS: 628 return (0); 629 case KERN_PROTECTION_FAILURE: 630 return (EACCES); 631 } 632 return (EINVAL); 633 } 634 635 #ifndef _SYS_SYSPROTO_H_ 636 struct minherit_args { 637 void *addr; 638 size_t len; 639 int inherit; 640 }; 641 #endif 642 /* 643 * MPSAFE 644 */ 645 int 646 minherit(td, uap) 647 struct thread *td; 648 struct minherit_args *uap; 649 { 650 vm_offset_t addr; 651 vm_size_t size, pageoff; 652 vm_inherit_t inherit; 653 654 addr = (vm_offset_t)uap->addr; 655 size = uap->len; 656 inherit = uap->inherit; 657 658 pageoff = (addr & PAGE_MASK); 659 addr -= pageoff; 660 size += pageoff; 661 size = (vm_size_t) round_page(size); 662 if (addr + size < addr) 663 return (EINVAL); 664 665 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 666 addr + size, inherit)) { 667 case KERN_SUCCESS: 668 return (0); 669 case KERN_PROTECTION_FAILURE: 670 return (EACCES); 671 } 672 return (EINVAL); 673 } 674 675 #ifndef _SYS_SYSPROTO_H_ 676 struct madvise_args { 677 void *addr; 678 size_t len; 679 int behav; 680 }; 681 #endif 682 683 /* 684 * MPSAFE 685 */ 686 /* ARGSUSED */ 687 int 688 madvise(td, uap) 689 struct thread *td; 690 struct madvise_args *uap; 691 { 692 vm_offset_t start, end; 693 vm_map_t map; 694 struct proc *p; 695 int error; 696 697 /* 698 * Check for our special case, advising the swap pager we are 699 * "immortal." 700 */ 701 if (uap->behav == MADV_PROTECT) { 702 error = priv_check(td, PRIV_VM_MADV_PROTECT); 703 if (error == 0) { 704 p = td->td_proc; 705 PROC_LOCK(p); 706 p->p_flag |= P_PROTECTED; 707 PROC_UNLOCK(p); 708 } 709 return (error); 710 } 711 /* 712 * Check for illegal behavior 713 */ 714 if (uap->behav < 0 || uap->behav > MADV_CORE) 715 return (EINVAL); 716 /* 717 * Check for illegal addresses. Watch out for address wrap... Note 718 * that VM_*_ADDRESS are not constants due to casts (argh). 719 */ 720 map = &td->td_proc->p_vmspace->vm_map; 721 if ((vm_offset_t)uap->addr < vm_map_min(map) || 722 (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) 723 return (EINVAL); 724 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 725 return (EINVAL); 726 727 /* 728 * Since this routine is only advisory, we default to conservative 729 * behavior. 730 */ 731 start = trunc_page((vm_offset_t) uap->addr); 732 end = round_page((vm_offset_t) uap->addr + uap->len); 733 734 if (vm_map_madvise(map, start, end, uap->behav)) 735 return (EINVAL); 736 return (0); 737 } 738 739 #ifndef _SYS_SYSPROTO_H_ 740 struct mincore_args { 741 const void *addr; 742 size_t len; 743 char *vec; 744 }; 745 #endif 746 747 /* 748 * MPSAFE 749 */ 750 /* ARGSUSED */ 751 int 752 mincore(td, uap) 753 struct thread *td; 754 struct mincore_args *uap; 755 { 756 vm_offset_t addr, first_addr; 757 vm_offset_t end, cend; 758 pmap_t pmap; 759 vm_map_t map; 760 char *vec; 761 int error = 0; 762 int vecindex, lastvecindex; 763 vm_map_entry_t current; 764 vm_map_entry_t entry; 765 int mincoreinfo; 766 unsigned int timestamp; 767 768 /* 769 * Make sure that the addresses presented are valid for user 770 * mode. 771 */ 772 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 773 end = addr + (vm_size_t)round_page(uap->len); 774 map = &td->td_proc->p_vmspace->vm_map; 775 if (end > vm_map_max(map) || end < addr) 776 return (ENOMEM); 777 778 /* 779 * Address of byte vector 780 */ 781 vec = uap->vec; 782 783 pmap = vmspace_pmap(td->td_proc->p_vmspace); 784 785 vm_map_lock_read(map); 786 RestartScan: 787 timestamp = map->timestamp; 788 789 if (!vm_map_lookup_entry(map, addr, &entry)) { 790 vm_map_unlock_read(map); 791 return (ENOMEM); 792 } 793 794 /* 795 * Do this on a map entry basis so that if the pages are not 796 * in the current processes address space, we can easily look 797 * up the pages elsewhere. 798 */ 799 lastvecindex = -1; 800 for (current = entry; 801 (current != &map->header) && (current->start < end); 802 current = current->next) { 803 804 /* 805 * check for contiguity 806 */ 807 if (current->end < end && 808 (entry->next == &map->header || 809 current->next->start > current->end)) { 810 vm_map_unlock_read(map); 811 return (ENOMEM); 812 } 813 814 /* 815 * ignore submaps (for now) or null objects 816 */ 817 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 818 current->object.vm_object == NULL) 819 continue; 820 821 /* 822 * limit this scan to the current map entry and the 823 * limits for the mincore call 824 */ 825 if (addr < current->start) 826 addr = current->start; 827 cend = current->end; 828 if (cend > end) 829 cend = end; 830 831 /* 832 * scan this entry one page at a time 833 */ 834 while (addr < cend) { 835 /* 836 * Check pmap first, it is likely faster, also 837 * it can provide info as to whether we are the 838 * one referencing or modifying the page. 839 */ 840 mincoreinfo = pmap_mincore(pmap, addr); 841 if (!mincoreinfo) { 842 vm_pindex_t pindex; 843 vm_ooffset_t offset; 844 vm_page_t m; 845 /* 846 * calculate the page index into the object 847 */ 848 offset = current->offset + (addr - current->start); 849 pindex = OFF_TO_IDX(offset); 850 VM_OBJECT_LOCK(current->object.vm_object); 851 m = vm_page_lookup(current->object.vm_object, 852 pindex); 853 /* 854 * if the page is resident, then gather information about 855 * it. 856 */ 857 if (m != NULL && m->valid != 0) { 858 mincoreinfo = MINCORE_INCORE; 859 vm_page_lock_queues(); 860 if (m->dirty || 861 pmap_is_modified(m)) 862 mincoreinfo |= MINCORE_MODIFIED_OTHER; 863 if ((m->flags & PG_REFERENCED) || 864 pmap_ts_referenced(m)) { 865 vm_page_flag_set(m, PG_REFERENCED); 866 mincoreinfo |= MINCORE_REFERENCED_OTHER; 867 } 868 vm_page_unlock_queues(); 869 } 870 VM_OBJECT_UNLOCK(current->object.vm_object); 871 } 872 873 /* 874 * subyte may page fault. In case it needs to modify 875 * the map, we release the lock. 876 */ 877 vm_map_unlock_read(map); 878 879 /* 880 * calculate index into user supplied byte vector 881 */ 882 vecindex = OFF_TO_IDX(addr - first_addr); 883 884 /* 885 * If we have skipped map entries, we need to make sure that 886 * the byte vector is zeroed for those skipped entries. 887 */ 888 while ((lastvecindex + 1) < vecindex) { 889 error = subyte(vec + lastvecindex, 0); 890 if (error) { 891 error = EFAULT; 892 goto done2; 893 } 894 ++lastvecindex; 895 } 896 897 /* 898 * Pass the page information to the user 899 */ 900 error = subyte(vec + vecindex, mincoreinfo); 901 if (error) { 902 error = EFAULT; 903 goto done2; 904 } 905 906 /* 907 * If the map has changed, due to the subyte, the previous 908 * output may be invalid. 909 */ 910 vm_map_lock_read(map); 911 if (timestamp != map->timestamp) 912 goto RestartScan; 913 914 lastvecindex = vecindex; 915 addr += PAGE_SIZE; 916 } 917 } 918 919 /* 920 * subyte may page fault. In case it needs to modify 921 * the map, we release the lock. 922 */ 923 vm_map_unlock_read(map); 924 925 /* 926 * Zero the last entries in the byte vector. 927 */ 928 vecindex = OFF_TO_IDX(end - first_addr); 929 while ((lastvecindex + 1) < vecindex) { 930 error = subyte(vec + lastvecindex, 0); 931 if (error) { 932 error = EFAULT; 933 goto done2; 934 } 935 ++lastvecindex; 936 } 937 938 /* 939 * If the map has changed, due to the subyte, the previous 940 * output may be invalid. 941 */ 942 vm_map_lock_read(map); 943 if (timestamp != map->timestamp) 944 goto RestartScan; 945 vm_map_unlock_read(map); 946 done2: 947 return (error); 948 } 949 950 #ifndef _SYS_SYSPROTO_H_ 951 struct mlock_args { 952 const void *addr; 953 size_t len; 954 }; 955 #endif 956 /* 957 * MPSAFE 958 */ 959 int 960 mlock(td, uap) 961 struct thread *td; 962 struct mlock_args *uap; 963 { 964 struct proc *proc; 965 vm_offset_t addr, end, last, start; 966 vm_size_t npages, size; 967 int error; 968 969 error = priv_check(td, PRIV_VM_MLOCK); 970 if (error) 971 return (error); 972 addr = (vm_offset_t)uap->addr; 973 size = uap->len; 974 last = addr + size; 975 start = trunc_page(addr); 976 end = round_page(last); 977 if (last < addr || end < addr) 978 return (EINVAL); 979 npages = atop(end - start); 980 if (npages > vm_page_max_wired) 981 return (ENOMEM); 982 proc = td->td_proc; 983 PROC_LOCK(proc); 984 if (ptoa(npages + 985 pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > 986 lim_cur(proc, RLIMIT_MEMLOCK)) { 987 PROC_UNLOCK(proc); 988 return (ENOMEM); 989 } 990 PROC_UNLOCK(proc); 991 if (npages + cnt.v_wire_count > vm_page_max_wired) 992 return (EAGAIN); 993 error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, 994 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 995 return (error == KERN_SUCCESS ? 0 : ENOMEM); 996 } 997 998 #ifndef _SYS_SYSPROTO_H_ 999 struct mlockall_args { 1000 int how; 1001 }; 1002 #endif 1003 1004 /* 1005 * MPSAFE 1006 */ 1007 int 1008 mlockall(td, uap) 1009 struct thread *td; 1010 struct mlockall_args *uap; 1011 { 1012 vm_map_t map; 1013 int error; 1014 1015 map = &td->td_proc->p_vmspace->vm_map; 1016 error = 0; 1017 1018 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1019 return (EINVAL); 1020 1021 #if 0 1022 /* 1023 * If wiring all pages in the process would cause it to exceed 1024 * a hard resource limit, return ENOMEM. 1025 */ 1026 PROC_LOCK(td->td_proc); 1027 if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) > 1028 lim_cur(td->td_proc, RLIMIT_MEMLOCK))) { 1029 PROC_UNLOCK(td->td_proc); 1030 return (ENOMEM); 1031 } 1032 PROC_UNLOCK(td->td_proc); 1033 #else 1034 error = priv_check(td, PRIV_VM_MLOCK); 1035 if (error) 1036 return (error); 1037 #endif 1038 1039 if (uap->how & MCL_FUTURE) { 1040 vm_map_lock(map); 1041 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1042 vm_map_unlock(map); 1043 error = 0; 1044 } 1045 1046 if (uap->how & MCL_CURRENT) { 1047 /* 1048 * P1003.1-2001 mandates that all currently mapped pages 1049 * will be memory resident and locked (wired) upon return 1050 * from mlockall(). vm_map_wire() will wire pages, by 1051 * calling vm_fault_wire() for each page in the region. 1052 */ 1053 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1054 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1055 error = (error == KERN_SUCCESS ? 0 : EAGAIN); 1056 } 1057 1058 return (error); 1059 } 1060 1061 #ifndef _SYS_SYSPROTO_H_ 1062 struct munlockall_args { 1063 register_t dummy; 1064 }; 1065 #endif 1066 1067 /* 1068 * MPSAFE 1069 */ 1070 int 1071 munlockall(td, uap) 1072 struct thread *td; 1073 struct munlockall_args *uap; 1074 { 1075 vm_map_t map; 1076 int error; 1077 1078 map = &td->td_proc->p_vmspace->vm_map; 1079 error = priv_check(td, PRIV_VM_MUNLOCK); 1080 if (error) 1081 return (error); 1082 1083 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1084 vm_map_lock(map); 1085 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1086 vm_map_unlock(map); 1087 1088 /* Forcibly unwire all pages. */ 1089 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1090 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1091 1092 return (error); 1093 } 1094 1095 #ifndef _SYS_SYSPROTO_H_ 1096 struct munlock_args { 1097 const void *addr; 1098 size_t len; 1099 }; 1100 #endif 1101 /* 1102 * MPSAFE 1103 */ 1104 int 1105 munlock(td, uap) 1106 struct thread *td; 1107 struct munlock_args *uap; 1108 { 1109 vm_offset_t addr, end, last, start; 1110 vm_size_t size; 1111 int error; 1112 1113 error = priv_check(td, PRIV_VM_MUNLOCK); 1114 if (error) 1115 return (error); 1116 addr = (vm_offset_t)uap->addr; 1117 size = uap->len; 1118 last = addr + size; 1119 start = trunc_page(addr); 1120 end = round_page(last); 1121 if (last < addr || end < addr) 1122 return (EINVAL); 1123 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1124 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1125 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1126 } 1127 1128 /* 1129 * vm_mmap_vnode() 1130 * 1131 * MPSAFE 1132 * 1133 * Helper function for vm_mmap. Perform sanity check specific for mmap 1134 * operations on vnodes. 1135 */ 1136 int 1137 vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1138 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1139 struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp) 1140 { 1141 struct vattr va; 1142 void *handle; 1143 vm_object_t obj; 1144 struct mount *mp; 1145 int error, flags, type; 1146 int vfslocked; 1147 1148 mp = vp->v_mount; 1149 vfslocked = VFS_LOCK_GIANT(mp); 1150 if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) { 1151 VFS_UNLOCK_GIANT(vfslocked); 1152 return (error); 1153 } 1154 flags = *flagsp; 1155 obj = vp->v_object; 1156 if (vp->v_type == VREG) { 1157 /* 1158 * Get the proper underlying object 1159 */ 1160 if (obj == NULL) { 1161 error = EINVAL; 1162 goto done; 1163 } 1164 if (obj->handle != vp) { 1165 vput(vp); 1166 vp = (struct vnode*)obj->handle; 1167 vget(vp, LK_EXCLUSIVE, td); 1168 } 1169 type = OBJT_VNODE; 1170 handle = vp; 1171 } else if (vp->v_type == VCHR) { 1172 type = OBJT_DEVICE; 1173 handle = vp->v_rdev; 1174 1175 /* XXX: lack thredref on device */ 1176 if(vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) { 1177 *maxprotp = VM_PROT_ALL; 1178 *flagsp |= MAP_ANON; 1179 error = 0; 1180 goto done; 1181 } 1182 /* 1183 * cdevs does not provide private mappings of any kind. 1184 */ 1185 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1186 (prot & PROT_WRITE) != 0) { 1187 error = EACCES; 1188 goto done; 1189 } 1190 if (flags & (MAP_PRIVATE|MAP_COPY)) { 1191 error = EINVAL; 1192 goto done; 1193 } 1194 /* 1195 * Force device mappings to be shared. 1196 */ 1197 flags |= MAP_SHARED; 1198 } else { 1199 error = EINVAL; 1200 goto done; 1201 } 1202 if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) { 1203 goto done; 1204 } 1205 #ifdef MAC 1206 error = mac_check_vnode_mmap(td->td_ucred, vp, prot, flags); 1207 if (error != 0) 1208 goto done; 1209 #endif 1210 if ((flags & MAP_SHARED) != 0) { 1211 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1212 if (prot & PROT_WRITE) { 1213 error = EPERM; 1214 goto done; 1215 } 1216 *maxprotp &= ~VM_PROT_WRITE; 1217 } 1218 } 1219 /* 1220 * If it is a regular file without any references 1221 * we do not need to sync it. 1222 * Adjust object size to be the size of actual file. 1223 */ 1224 if (vp->v_type == VREG) { 1225 objsize = round_page(va.va_size); 1226 if (va.va_nlink == 0) 1227 flags |= MAP_NOSYNC; 1228 } 1229 obj = vm_pager_allocate(type, handle, objsize, prot, foff); 1230 if (obj == NULL) { 1231 error = (type == OBJT_DEVICE ? EINVAL : ENOMEM); 1232 goto done; 1233 } 1234 *objp = obj; 1235 *flagsp = flags; 1236 vfs_mark_atime(vp, td); 1237 1238 done: 1239 vput(vp); 1240 VFS_UNLOCK_GIANT(vfslocked); 1241 return (error); 1242 } 1243 1244 /* 1245 * vm_mmap_cdev() 1246 * 1247 * MPSAFE 1248 * 1249 * Helper function for vm_mmap. Perform sanity check specific for mmap 1250 * operations on cdevs. 1251 */ 1252 int 1253 vm_mmap_cdev(struct thread *td, vm_size_t objsize, 1254 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1255 struct cdev *cdev, vm_ooffset_t foff, vm_object_t *objp) 1256 { 1257 vm_object_t obj; 1258 int flags; 1259 1260 flags = *flagsp; 1261 1262 /* XXX: lack thredref on device */ 1263 if (cdev->si_devsw->d_flags & D_MMAP_ANON) { 1264 *maxprotp = VM_PROT_ALL; 1265 *flagsp |= MAP_ANON; 1266 return (0); 1267 } 1268 /* 1269 * cdevs does not provide private mappings of any kind. 1270 */ 1271 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1272 (prot & PROT_WRITE) != 0) 1273 return (EACCES); 1274 if (flags & (MAP_PRIVATE|MAP_COPY)) 1275 return (EINVAL); 1276 /* 1277 * Force device mappings to be shared. 1278 */ 1279 flags |= MAP_SHARED; 1280 #ifdef MAC_XXX 1281 error = mac_check_cdev_mmap(td->td_ucred, cdev, prot); 1282 if (error != 0) 1283 return (error); 1284 #endif 1285 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, foff); 1286 if (obj == NULL) 1287 return (EINVAL); 1288 *objp = obj; 1289 *flagsp = flags; 1290 return (0); 1291 } 1292 1293 /* 1294 * vm_mmap() 1295 * 1296 * MPSAFE 1297 * 1298 * Internal version of mmap. Currently used by mmap, exec, and sys5 1299 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1300 */ 1301 int 1302 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1303 vm_prot_t maxprot, int flags, 1304 objtype_t handle_type, void *handle, 1305 vm_ooffset_t foff) 1306 { 1307 boolean_t fitit; 1308 vm_object_t object = NULL; 1309 int rv = KERN_SUCCESS; 1310 int docow, error; 1311 struct thread *td = curthread; 1312 1313 if (size == 0) 1314 return (0); 1315 1316 size = round_page(size); 1317 1318 PROC_LOCK(td->td_proc); 1319 if (td->td_proc->p_vmspace->vm_map.size + size > 1320 lim_cur(td->td_proc, RLIMIT_VMEM)) { 1321 PROC_UNLOCK(td->td_proc); 1322 return(ENOMEM); 1323 } 1324 PROC_UNLOCK(td->td_proc); 1325 1326 /* 1327 * We currently can only deal with page aligned file offsets. 1328 * The check is here rather than in the syscall because the 1329 * kernel calls this function internally for other mmaping 1330 * operations (such as in exec) and non-aligned offsets will 1331 * cause pmap inconsistencies...so we want to be sure to 1332 * disallow this in all cases. 1333 */ 1334 if (foff & PAGE_MASK) 1335 return (EINVAL); 1336 1337 if ((flags & MAP_FIXED) == 0) { 1338 fitit = TRUE; 1339 *addr = round_page(*addr); 1340 } else { 1341 if (*addr != trunc_page(*addr)) 1342 return (EINVAL); 1343 fitit = FALSE; 1344 } 1345 /* 1346 * Lookup/allocate object. 1347 */ 1348 switch (handle_type) { 1349 case OBJT_DEVICE: 1350 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, 1351 handle, foff, &object); 1352 break; 1353 case OBJT_VNODE: 1354 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1355 handle, foff, &object); 1356 break; 1357 case OBJT_DEFAULT: 1358 if (handle == NULL) { 1359 error = 0; 1360 break; 1361 } 1362 /* FALLTHROUGH */ 1363 default: 1364 error = EINVAL; 1365 break; 1366 } 1367 if (error) 1368 return (error); 1369 if (flags & MAP_ANON) { 1370 object = NULL; 1371 docow = 0; 1372 /* 1373 * Unnamed anonymous regions always start at 0. 1374 */ 1375 if (handle == 0) 1376 foff = 0; 1377 } else { 1378 docow = MAP_PREFAULT_PARTIAL; 1379 } 1380 1381 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1382 docow |= MAP_COPY_ON_WRITE; 1383 if (flags & MAP_NOSYNC) 1384 docow |= MAP_DISABLE_SYNCER; 1385 if (flags & MAP_NOCORE) 1386 docow |= MAP_DISABLE_COREDUMP; 1387 1388 #if defined(VM_PROT_READ_IS_EXEC) 1389 if (prot & VM_PROT_READ) 1390 prot |= VM_PROT_EXECUTE; 1391 1392 if (maxprot & VM_PROT_READ) 1393 maxprot |= VM_PROT_EXECUTE; 1394 #endif 1395 1396 if (fitit) 1397 *addr = pmap_addr_hint(object, *addr, size); 1398 1399 if (flags & MAP_STACK) 1400 rv = vm_map_stack(map, *addr, size, prot, maxprot, 1401 docow | MAP_STACK_GROWS_DOWN); 1402 else if (fitit) 1403 rv = vm_map_find(map, object, foff, addr, size, TRUE, 1404 prot, maxprot, docow); 1405 else 1406 rv = vm_map_fixed(map, object, foff, addr, size, 1407 prot, maxprot, docow); 1408 1409 if (rv != KERN_SUCCESS) { 1410 /* 1411 * Lose the object reference. Will destroy the 1412 * object if it's an unnamed anonymous mapping 1413 * or named anonymous without other references. 1414 */ 1415 vm_object_deallocate(object); 1416 } else if (flags & MAP_SHARED) { 1417 /* 1418 * Shared memory is also shared with children. 1419 */ 1420 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); 1421 if (rv != KERN_SUCCESS) 1422 (void) vm_map_remove(map, *addr, *addr + size); 1423 } 1424 1425 /* 1426 * If the process has requested that all future mappings 1427 * be wired, then heed this. 1428 */ 1429 if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE)) 1430 vm_map_wire(map, *addr, *addr + size, 1431 VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); 1432 1433 switch (rv) { 1434 case KERN_SUCCESS: 1435 return (0); 1436 case KERN_INVALID_ADDRESS: 1437 case KERN_NO_SPACE: 1438 return (ENOMEM); 1439 case KERN_PROTECTION_FAILURE: 1440 return (EACCES); 1441 default: 1442 return (EINVAL); 1443 } 1444 } 1445