/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0,
    "Maximum number of memory-mapped files per process");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel from running out of
 * resources if attacked from a compromised user account but generous enough
 * such that multi-threaded processes are not unduly inconvenienced.
 */
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init,
    NULL);

static void
vmmapentry_rsrc_init(dummy)
	void *dummy;
{
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int flags, error;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;
	/* make sure mapping fits into numeric range etc */
	if (uap->len == 0 ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);
		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping file, get fp for validation and
		 * don't let the descriptor disappear on us if we block.
		 */
		if ((error = fget(td, uap->fd, &fp)) != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	td->td_fpop = fp;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		     entry != &map->header && entry->start < addr + size;
		     entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
				entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				PMC_CALL_HOOK(td, PMC_FN_MUNMAP,
				    (void *) &pkm);
				break;
			}
		}
	}
#endif
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_delete(map, addr, addr + size);
	vm_map_unlock(map);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = priv_check(td, PRIV_VM_MADV_PROTECT);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;
				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				VM_OBJECT_LOCK(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather information about
				 * it.
				 */
				if (m != NULL && m->valid != 0) {
					mincoreinfo = MINCORE_INCORE;
					vm_page_lock_queues();
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
					vm_page_unlock_queues();
				}
				VM_OBJECT_UNLOCK(current->object.vm_object);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	struct proc *proc;
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	int error;

	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	proc = td->td_proc;
	PROC_LOCK(proc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
	    lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = 0;

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

#if 0
	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	PROC_LOCK(td->td_proc);
	if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) >
		lim_cur(td->td_proc, RLIMIT_MEMLOCK))) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);
#else
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags;
	int vfslocked;

	mp = vp->v_mount;
	cred = td->td_ucred;
	vfslocked = VFS_LOCK_GIANT(mp);
	if ((error = vget(vp, LK_SHARED, td)) != 0) {
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->handle != vp) {
			vput(vp);
			vp = (struct vnode*)obj->handle;
			vget(vp, LK_SHARED, td);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, td->td_ucred);
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags;

	flags = *flagsp;

	dsw = dev_refthread(cdev);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	int rv = KERN_SUCCESS;
	int docow, error;
	struct thread *td = curthread;

	if (size == 0)
		return (0);

	size = round_page(size);

	PROC_LOCK(td->td_proc);
	if (td->td_proc->p_vmspace->vm_map.size + size >
	    lim_cur(td->td_proc, RLIMIT_VMEM)) {
		PROC_UNLOCK(td->td_proc);
		return(ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		docow = MAP_PREFAULT_PARTIAL;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else if (fitit)
		rv = vm_map_find(map, object, foff, addr, size,
		    object != NULL && object->type == OBJT_DEVICE ?
		    VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
	else
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	} else if (flags & MAP_SHARED) {
		/*
		 * Shared memory is also shared with children.
		 */
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS)
			(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * If the process has requested that all future mappings
	 * be wired, then heed this.
	 */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_wire(map, *addr, *addr + size,
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}