1 /*- 2 * Copyright (c) 1988 University of Utah. 3 * Copyright (c) 1991, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the Systems Programming Group of the University of Utah Computer 8 * Science Department. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 35 * 36 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 37 */ 38 39 /* 40 * Mapped file (mmap) interface to VM 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_compat.h" 47 #include "opt_hwpmc_hooks.h" 48 #include "opt_mac.h" 49 50 #include <sys/param.h> 51 #include <sys/systm.h> 52 #include <sys/kernel.h> 53 #include <sys/lock.h> 54 #include <sys/mutex.h> 55 #include <sys/sysproto.h> 56 #include <sys/filedesc.h> 57 #include <sys/priv.h> 58 #include <sys/proc.h> 59 #include <sys/resource.h> 60 #include <sys/resourcevar.h> 61 #include <sys/vnode.h> 62 #include <sys/fcntl.h> 63 #include <sys/file.h> 64 #include <sys/mman.h> 65 #include <sys/mount.h> 66 #include <sys/conf.h> 67 #include <sys/stat.h> 68 #include <sys/vmmeter.h> 69 #include <sys/sysctl.h> 70 71 #include <security/mac/mac_framework.h> 72 73 #include <vm/vm.h> 74 #include <vm/vm_param.h> 75 #include <vm/pmap.h> 76 #include <vm/vm_map.h> 77 #include <vm/vm_object.h> 78 #include <vm/vm_page.h> 79 #include <vm/vm_pager.h> 80 #include <vm/vm_pageout.h> 81 #include <vm/vm_extern.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_kern.h> 84 85 #ifdef HWPMC_HOOKS 86 #include <sys/pmckern.h> 87 #endif 88 89 #ifndef _SYS_SYSPROTO_H_ 90 struct sbrk_args { 91 int incr; 92 }; 93 #endif 94 95 static int max_proc_mmap; 96 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, 97 "Maximum number of memory-mapped files per process"); 98 99 /* 100 * Set the maximum number of vm_map_entry structures per process. Roughly 101 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 102 * of our KVM malloc space still results in generous limits. We want a 103 * default that is good enough to prevent the kernel running out of resources 104 * if attacked from compromised user account but generous enough such that 105 * multi-threaded processes are not unduly inconvenienced. 106 */ 107 static void vmmapentry_rsrc_init(void *); 108 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, 109 NULL); 110 111 static void 112 vmmapentry_rsrc_init(dummy) 113 void *dummy; 114 { 115 max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); 116 max_proc_mmap /= 100; 117 } 118 119 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 120 int *, struct vnode *, vm_ooffset_t, vm_object_t *); 121 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 122 int *, struct cdev *, vm_ooffset_t, vm_object_t *); 123 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 124 int *, struct shmfd *, vm_ooffset_t, vm_object_t *); 125 126 /* 127 * MPSAFE 128 */ 129 /* ARGSUSED */ 130 int 131 sbrk(td, uap) 132 struct thread *td; 133 struct sbrk_args *uap; 134 { 135 /* Not yet implemented */ 136 return (EOPNOTSUPP); 137 } 138 139 #ifndef _SYS_SYSPROTO_H_ 140 struct sstk_args { 141 int incr; 142 }; 143 #endif 144 145 /* 146 * MPSAFE 147 */ 148 /* ARGSUSED */ 149 int 150 sstk(td, uap) 151 struct thread *td; 152 struct sstk_args *uap; 153 { 154 /* Not yet implemented */ 155 return (EOPNOTSUPP); 156 } 157 158 #if defined(COMPAT_43) 159 #ifndef _SYS_SYSPROTO_H_ 160 struct getpagesize_args { 161 int dummy; 162 }; 163 #endif 164 165 /* ARGSUSED */ 166 int 167 ogetpagesize(td, uap) 168 struct thread *td; 169 struct getpagesize_args *uap; 170 { 171 /* MP SAFE */ 172 td->td_retval[0] = PAGE_SIZE; 173 return (0); 174 } 175 #endif /* COMPAT_43 */ 176 177 178 /* 179 * Memory Map (mmap) system call. Note that the file offset 180 * and address are allowed to be NOT page aligned, though if 181 * the MAP_FIXED flag it set, both must have the same remainder 182 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 183 * page-aligned, the actual mapping starts at trunc_page(addr) 184 * and the return value is adjusted up by the page offset. 185 * 186 * Generally speaking, only character devices which are themselves 187 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 188 * there would be no cache coherency between a descriptor and a VM mapping 189 * both to the same character device. 190 */ 191 #ifndef _SYS_SYSPROTO_H_ 192 struct mmap_args { 193 void *addr; 194 size_t len; 195 int prot; 196 int flags; 197 int fd; 198 long pad; 199 off_t pos; 200 }; 201 #endif 202 203 /* 204 * MPSAFE 205 */ 206 int 207 mmap(td, uap) 208 struct thread *td; 209 struct mmap_args *uap; 210 { 211 #ifdef HWPMC_HOOKS 212 struct pmckern_map_in pkm; 213 #endif 214 struct file *fp; 215 struct vnode *vp; 216 vm_offset_t addr; 217 vm_size_t size, pageoff; 218 vm_prot_t prot, maxprot; 219 void *handle; 220 objtype_t handle_type; 221 int flags, error; 222 off_t pos; 223 struct vmspace *vms = td->td_proc->p_vmspace; 224 225 addr = (vm_offset_t) uap->addr; 226 size = uap->len; 227 prot = uap->prot & VM_PROT_ALL; 228 flags = uap->flags; 229 pos = uap->pos; 230 231 fp = NULL; 232 /* make sure mapping fits into numeric range etc */ 233 if ((ssize_t) uap->len < 0 || 234 ((flags & MAP_ANON) && uap->fd != -1)) 235 return (EINVAL); 236 237 if (flags & MAP_STACK) { 238 if ((uap->fd != -1) || 239 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 240 return (EINVAL); 241 flags |= MAP_ANON; 242 pos = 0; 243 } 244 245 /* 246 * Align the file position to a page boundary, 247 * and save its page offset component. 248 */ 249 pageoff = (pos & PAGE_MASK); 250 pos -= pageoff; 251 252 /* Adjust size for rounding (on both ends). */ 253 size += pageoff; /* low end... */ 254 size = (vm_size_t) round_page(size); /* hi end */ 255 256 /* 257 * Check for illegal addresses. Watch out for address wrap... Note 258 * that VM_*_ADDRESS are not constants due to casts (argh). 259 */ 260 if (flags & MAP_FIXED) { 261 /* 262 * The specified address must have the same remainder 263 * as the file offset taken modulo PAGE_SIZE, so it 264 * should be aligned after adjustment by pageoff. 265 */ 266 addr -= pageoff; 267 if (addr & PAGE_MASK) 268 return (EINVAL); 269 /* Address range must be all in user VM space. */ 270 if (addr < vm_map_min(&vms->vm_map) || 271 addr + size > vm_map_max(&vms->vm_map)) 272 return (EINVAL); 273 if (addr + size < addr) 274 return (EINVAL); 275 } else { 276 /* 277 * XXX for non-fixed mappings where no hint is provided or 278 * the hint would fall in the potential heap space, 279 * place it after the end of the largest possible heap. 280 * 281 * There should really be a pmap call to determine a reasonable 282 * location. 283 */ 284 PROC_LOCK(td->td_proc); 285 if (addr == 0 || 286 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 287 addr < round_page((vm_offset_t)vms->vm_daddr + 288 lim_max(td->td_proc, RLIMIT_DATA)))) 289 addr = round_page((vm_offset_t)vms->vm_daddr + 290 lim_max(td->td_proc, RLIMIT_DATA)); 291 PROC_UNLOCK(td->td_proc); 292 } 293 if (flags & MAP_ANON) { 294 /* 295 * Mapping blank space is trivial. 296 */ 297 handle = NULL; 298 handle_type = OBJT_DEFAULT; 299 maxprot = VM_PROT_ALL; 300 pos = 0; 301 } else { 302 /* 303 * Mapping file, get fp for validation and 304 * don't let the descriptor disappear on us if we block. 305 */ 306 if ((error = fget(td, uap->fd, &fp)) != 0) 307 goto done; 308 if (fp->f_type == DTYPE_SHM) { 309 handle = fp->f_data; 310 handle_type = OBJT_SWAP; 311 maxprot = VM_PROT_NONE; 312 313 /* FREAD should always be set. */ 314 if (fp->f_flag & FREAD) 315 maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; 316 if (fp->f_flag & FWRITE) 317 maxprot |= VM_PROT_WRITE; 318 goto map; 319 } 320 if (fp->f_type != DTYPE_VNODE) { 321 error = ENODEV; 322 goto done; 323 } 324 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ 325 defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) 326 /* 327 * POSIX shared-memory objects are defined to have 328 * kernel persistence, and are not defined to support 329 * read(2)/write(2) -- or even open(2). Thus, we can 330 * use MAP_ASYNC to trade on-disk coherence for speed. 331 * The shm_open(3) library routine turns on the FPOSIXSHM 332 * flag to request this behavior. 333 */ 334 if (fp->f_flag & FPOSIXSHM) 335 flags |= MAP_NOSYNC; 336 #endif 337 vp = fp->f_vnode; 338 /* 339 * Ensure that file and memory protections are 340 * compatible. Note that we only worry about 341 * writability if mapping is shared; in this case, 342 * current and max prot are dictated by the open file. 343 * XXX use the vnode instead? Problem is: what 344 * credentials do we use for determination? What if 345 * proc does a setuid? 346 */ 347 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) 348 maxprot = VM_PROT_NONE; 349 else 350 maxprot = VM_PROT_EXECUTE; 351 if (fp->f_flag & FREAD) { 352 maxprot |= VM_PROT_READ; 353 } else if (prot & PROT_READ) { 354 error = EACCES; 355 goto done; 356 } 357 /* 358 * If we are sharing potential changes (either via 359 * MAP_SHARED or via the implicit sharing of character 360 * device mappings), and we are trying to get write 361 * permission although we opened it without asking 362 * for it, bail out. 363 */ 364 if ((flags & MAP_SHARED) != 0) { 365 if ((fp->f_flag & FWRITE) != 0) { 366 maxprot |= VM_PROT_WRITE; 367 } else if ((prot & PROT_WRITE) != 0) { 368 error = EACCES; 369 goto done; 370 } 371 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { 372 maxprot |= VM_PROT_WRITE; 373 } 374 handle = (void *)vp; 375 handle_type = OBJT_VNODE; 376 } 377 map: 378 379 /* 380 * Do not allow more then a certain number of vm_map_entry structures 381 * per process. Scale with the number of rforks sharing the map 382 * to make the limit reasonable for threads. 383 */ 384 if (max_proc_mmap && 385 vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { 386 error = ENOMEM; 387 goto done; 388 } 389 390 td->td_fpop = fp; 391 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 392 flags, handle_type, handle, pos); 393 td->td_fpop = NULL; 394 #ifdef HWPMC_HOOKS 395 /* inform hwpmc(4) if an executable is being mapped */ 396 if (error == 0 && handle_type == OBJT_VNODE && 397 (prot & PROT_EXEC)) { 398 pkm.pm_file = handle; 399 pkm.pm_address = (uintptr_t) addr; 400 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); 401 } 402 #endif 403 if (error == 0) 404 td->td_retval[0] = (register_t) (addr + pageoff); 405 done: 406 if (fp) 407 fdrop(fp, td); 408 409 return (error); 410 } 411 412 int 413 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 414 { 415 struct mmap_args oargs; 416 417 oargs.addr = uap->addr; 418 oargs.len = uap->len; 419 oargs.prot = uap->prot; 420 oargs.flags = uap->flags; 421 oargs.fd = uap->fd; 422 oargs.pos = uap->pos; 423 return (mmap(td, &oargs)); 424 } 425 426 #ifdef COMPAT_43 427 #ifndef _SYS_SYSPROTO_H_ 428 struct ommap_args { 429 caddr_t addr; 430 int len; 431 int prot; 432 int flags; 433 int fd; 434 long pos; 435 }; 436 #endif 437 int 438 ommap(td, uap) 439 struct thread *td; 440 struct ommap_args *uap; 441 { 442 struct mmap_args nargs; 443 static const char cvtbsdprot[8] = { 444 0, 445 PROT_EXEC, 446 PROT_WRITE, 447 PROT_EXEC | PROT_WRITE, 448 PROT_READ, 449 PROT_EXEC | PROT_READ, 450 PROT_WRITE | PROT_READ, 451 PROT_EXEC | PROT_WRITE | PROT_READ, 452 }; 453 454 #define OMAP_ANON 0x0002 455 #define OMAP_COPY 0x0020 456 #define OMAP_SHARED 0x0010 457 #define OMAP_FIXED 0x0100 458 459 nargs.addr = uap->addr; 460 nargs.len = uap->len; 461 nargs.prot = cvtbsdprot[uap->prot & 0x7]; 462 nargs.flags = 0; 463 if (uap->flags & OMAP_ANON) 464 nargs.flags |= MAP_ANON; 465 if (uap->flags & OMAP_COPY) 466 nargs.flags |= MAP_COPY; 467 if (uap->flags & OMAP_SHARED) 468 nargs.flags |= MAP_SHARED; 469 else 470 nargs.flags |= MAP_PRIVATE; 471 if (uap->flags & OMAP_FIXED) 472 nargs.flags |= MAP_FIXED; 473 nargs.fd = uap->fd; 474 nargs.pos = uap->pos; 475 return (mmap(td, &nargs)); 476 } 477 #endif /* COMPAT_43 */ 478 479 480 #ifndef _SYS_SYSPROTO_H_ 481 struct msync_args { 482 void *addr; 483 size_t len; 484 int flags; 485 }; 486 #endif 487 /* 488 * MPSAFE 489 */ 490 int 491 msync(td, uap) 492 struct thread *td; 493 struct msync_args *uap; 494 { 495 vm_offset_t addr; 496 vm_size_t size, pageoff; 497 int flags; 498 vm_map_t map; 499 int rv; 500 501 addr = (vm_offset_t) uap->addr; 502 size = uap->len; 503 flags = uap->flags; 504 505 pageoff = (addr & PAGE_MASK); 506 addr -= pageoff; 507 size += pageoff; 508 size = (vm_size_t) round_page(size); 509 if (addr + size < addr) 510 return (EINVAL); 511 512 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 513 return (EINVAL); 514 515 map = &td->td_proc->p_vmspace->vm_map; 516 517 /* 518 * Clean the pages and interpret the return value. 519 */ 520 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 521 (flags & MS_INVALIDATE) != 0); 522 switch (rv) { 523 case KERN_SUCCESS: 524 return (0); 525 case KERN_INVALID_ADDRESS: 526 return (EINVAL); /* Sun returns ENOMEM? */ 527 case KERN_INVALID_ARGUMENT: 528 return (EBUSY); 529 default: 530 return (EINVAL); 531 } 532 } 533 534 #ifndef _SYS_SYSPROTO_H_ 535 struct munmap_args { 536 void *addr; 537 size_t len; 538 }; 539 #endif 540 /* 541 * MPSAFE 542 */ 543 int 544 munmap(td, uap) 545 struct thread *td; 546 struct munmap_args *uap; 547 { 548 #ifdef HWPMC_HOOKS 549 struct pmckern_map_out pkm; 550 vm_map_entry_t entry; 551 #endif 552 vm_offset_t addr; 553 vm_size_t size, pageoff; 554 vm_map_t map; 555 556 addr = (vm_offset_t) uap->addr; 557 size = uap->len; 558 if (size == 0) 559 return (EINVAL); 560 561 pageoff = (addr & PAGE_MASK); 562 addr -= pageoff; 563 size += pageoff; 564 size = (vm_size_t) round_page(size); 565 if (addr + size < addr) 566 return (EINVAL); 567 568 /* 569 * Check for illegal addresses. Watch out for address wrap... 570 */ 571 map = &td->td_proc->p_vmspace->vm_map; 572 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 573 return (EINVAL); 574 vm_map_lock(map); 575 #ifdef HWPMC_HOOKS 576 /* 577 * Inform hwpmc if the address range being unmapped contains 578 * an executable region. 579 */ 580 if (vm_map_lookup_entry(map, addr, &entry)) { 581 for (; 582 entry != &map->header && entry->start < addr + size; 583 entry = entry->next) { 584 if (vm_map_check_protection(map, entry->start, 585 entry->end, VM_PROT_EXECUTE) == TRUE) { 586 pkm.pm_address = (uintptr_t) addr; 587 pkm.pm_size = (size_t) size; 588 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, 589 (void *) &pkm); 590 break; 591 } 592 } 593 } 594 #endif 595 /* returns nothing but KERN_SUCCESS anyway */ 596 vm_map_delete(map, addr, addr + size); 597 vm_map_unlock(map); 598 return (0); 599 } 600 601 #ifndef _SYS_SYSPROTO_H_ 602 struct mprotect_args { 603 const void *addr; 604 size_t len; 605 int prot; 606 }; 607 #endif 608 /* 609 * MPSAFE 610 */ 611 int 612 mprotect(td, uap) 613 struct thread *td; 614 struct mprotect_args *uap; 615 { 616 vm_offset_t addr; 617 vm_size_t size, pageoff; 618 vm_prot_t prot; 619 620 addr = (vm_offset_t) uap->addr; 621 size = uap->len; 622 prot = uap->prot & VM_PROT_ALL; 623 624 pageoff = (addr & PAGE_MASK); 625 addr -= pageoff; 626 size += pageoff; 627 size = (vm_size_t) round_page(size); 628 if (addr + size < addr) 629 return (EINVAL); 630 631 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 632 addr + size, prot, FALSE)) { 633 case KERN_SUCCESS: 634 return (0); 635 case KERN_PROTECTION_FAILURE: 636 return (EACCES); 637 } 638 return (EINVAL); 639 } 640 641 #ifndef _SYS_SYSPROTO_H_ 642 struct minherit_args { 643 void *addr; 644 size_t len; 645 int inherit; 646 }; 647 #endif 648 /* 649 * MPSAFE 650 */ 651 int 652 minherit(td, uap) 653 struct thread *td; 654 struct minherit_args *uap; 655 { 656 vm_offset_t addr; 657 vm_size_t size, pageoff; 658 vm_inherit_t inherit; 659 660 addr = (vm_offset_t)uap->addr; 661 size = uap->len; 662 inherit = uap->inherit; 663 664 pageoff = (addr & PAGE_MASK); 665 addr -= pageoff; 666 size += pageoff; 667 size = (vm_size_t) round_page(size); 668 if (addr + size < addr) 669 return (EINVAL); 670 671 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 672 addr + size, inherit)) { 673 case KERN_SUCCESS: 674 return (0); 675 case KERN_PROTECTION_FAILURE: 676 return (EACCES); 677 } 678 return (EINVAL); 679 } 680 681 #ifndef _SYS_SYSPROTO_H_ 682 struct madvise_args { 683 void *addr; 684 size_t len; 685 int behav; 686 }; 687 #endif 688 689 /* 690 * MPSAFE 691 */ 692 /* ARGSUSED */ 693 int 694 madvise(td, uap) 695 struct thread *td; 696 struct madvise_args *uap; 697 { 698 vm_offset_t start, end; 699 vm_map_t map; 700 struct proc *p; 701 int error; 702 703 /* 704 * Check for our special case, advising the swap pager we are 705 * "immortal." 706 */ 707 if (uap->behav == MADV_PROTECT) { 708 error = priv_check(td, PRIV_VM_MADV_PROTECT); 709 if (error == 0) { 710 p = td->td_proc; 711 PROC_LOCK(p); 712 p->p_flag |= P_PROTECTED; 713 PROC_UNLOCK(p); 714 } 715 return (error); 716 } 717 /* 718 * Check for illegal behavior 719 */ 720 if (uap->behav < 0 || uap->behav > MADV_CORE) 721 return (EINVAL); 722 /* 723 * Check for illegal addresses. Watch out for address wrap... Note 724 * that VM_*_ADDRESS are not constants due to casts (argh). 725 */ 726 map = &td->td_proc->p_vmspace->vm_map; 727 if ((vm_offset_t)uap->addr < vm_map_min(map) || 728 (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) 729 return (EINVAL); 730 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 731 return (EINVAL); 732 733 /* 734 * Since this routine is only advisory, we default to conservative 735 * behavior. 736 */ 737 start = trunc_page((vm_offset_t) uap->addr); 738 end = round_page((vm_offset_t) uap->addr + uap->len); 739 740 if (vm_map_madvise(map, start, end, uap->behav)) 741 return (EINVAL); 742 return (0); 743 } 744 745 #ifndef _SYS_SYSPROTO_H_ 746 struct mincore_args { 747 const void *addr; 748 size_t len; 749 char *vec; 750 }; 751 #endif 752 753 /* 754 * MPSAFE 755 */ 756 /* ARGSUSED */ 757 int 758 mincore(td, uap) 759 struct thread *td; 760 struct mincore_args *uap; 761 { 762 vm_offset_t addr, first_addr; 763 vm_offset_t end, cend; 764 pmap_t pmap; 765 vm_map_t map; 766 char *vec; 767 int error = 0; 768 int vecindex, lastvecindex; 769 vm_map_entry_t current; 770 vm_map_entry_t entry; 771 int mincoreinfo; 772 unsigned int timestamp; 773 774 /* 775 * Make sure that the addresses presented are valid for user 776 * mode. 777 */ 778 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 779 end = addr + (vm_size_t)round_page(uap->len); 780 map = &td->td_proc->p_vmspace->vm_map; 781 if (end > vm_map_max(map) || end < addr) 782 return (ENOMEM); 783 784 /* 785 * Address of byte vector 786 */ 787 vec = uap->vec; 788 789 pmap = vmspace_pmap(td->td_proc->p_vmspace); 790 791 vm_map_lock_read(map); 792 RestartScan: 793 timestamp = map->timestamp; 794 795 if (!vm_map_lookup_entry(map, addr, &entry)) { 796 vm_map_unlock_read(map); 797 return (ENOMEM); 798 } 799 800 /* 801 * Do this on a map entry basis so that if the pages are not 802 * in the current processes address space, we can easily look 803 * up the pages elsewhere. 804 */ 805 lastvecindex = -1; 806 for (current = entry; 807 (current != &map->header) && (current->start < end); 808 current = current->next) { 809 810 /* 811 * check for contiguity 812 */ 813 if (current->end < end && 814 (entry->next == &map->header || 815 current->next->start > current->end)) { 816 vm_map_unlock_read(map); 817 return (ENOMEM); 818 } 819 820 /* 821 * ignore submaps (for now) or null objects 822 */ 823 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 824 current->object.vm_object == NULL) 825 continue; 826 827 /* 828 * limit this scan to the current map entry and the 829 * limits for the mincore call 830 */ 831 if (addr < current->start) 832 addr = current->start; 833 cend = current->end; 834 if (cend > end) 835 cend = end; 836 837 /* 838 * scan this entry one page at a time 839 */ 840 while (addr < cend) { 841 /* 842 * Check pmap first, it is likely faster, also 843 * it can provide info as to whether we are the 844 * one referencing or modifying the page. 845 */ 846 mincoreinfo = pmap_mincore(pmap, addr); 847 if (!mincoreinfo) { 848 vm_pindex_t pindex; 849 vm_ooffset_t offset; 850 vm_page_t m; 851 /* 852 * calculate the page index into the object 853 */ 854 offset = current->offset + (addr - current->start); 855 pindex = OFF_TO_IDX(offset); 856 VM_OBJECT_LOCK(current->object.vm_object); 857 m = vm_page_lookup(current->object.vm_object, 858 pindex); 859 /* 860 * if the page is resident, then gather information about 861 * it. 862 */ 863 if (m != NULL && m->valid != 0) { 864 mincoreinfo = MINCORE_INCORE; 865 vm_page_lock_queues(); 866 if (m->dirty || 867 pmap_is_modified(m)) 868 mincoreinfo |= MINCORE_MODIFIED_OTHER; 869 if ((m->flags & PG_REFERENCED) || 870 pmap_ts_referenced(m)) { 871 vm_page_flag_set(m, PG_REFERENCED); 872 mincoreinfo |= MINCORE_REFERENCED_OTHER; 873 } 874 vm_page_unlock_queues(); 875 } 876 VM_OBJECT_UNLOCK(current->object.vm_object); 877 } 878 879 /* 880 * subyte may page fault. In case it needs to modify 881 * the map, we release the lock. 882 */ 883 vm_map_unlock_read(map); 884 885 /* 886 * calculate index into user supplied byte vector 887 */ 888 vecindex = OFF_TO_IDX(addr - first_addr); 889 890 /* 891 * If we have skipped map entries, we need to make sure that 892 * the byte vector is zeroed for those skipped entries. 893 */ 894 while ((lastvecindex + 1) < vecindex) { 895 error = subyte(vec + lastvecindex, 0); 896 if (error) { 897 error = EFAULT; 898 goto done2; 899 } 900 ++lastvecindex; 901 } 902 903 /* 904 * Pass the page information to the user 905 */ 906 error = subyte(vec + vecindex, mincoreinfo); 907 if (error) { 908 error = EFAULT; 909 goto done2; 910 } 911 912 /* 913 * If the map has changed, due to the subyte, the previous 914 * output may be invalid. 915 */ 916 vm_map_lock_read(map); 917 if (timestamp != map->timestamp) 918 goto RestartScan; 919 920 lastvecindex = vecindex; 921 addr += PAGE_SIZE; 922 } 923 } 924 925 /* 926 * subyte may page fault. In case it needs to modify 927 * the map, we release the lock. 928 */ 929 vm_map_unlock_read(map); 930 931 /* 932 * Zero the last entries in the byte vector. 933 */ 934 vecindex = OFF_TO_IDX(end - first_addr); 935 while ((lastvecindex + 1) < vecindex) { 936 error = subyte(vec + lastvecindex, 0); 937 if (error) { 938 error = EFAULT; 939 goto done2; 940 } 941 ++lastvecindex; 942 } 943 944 /* 945 * If the map has changed, due to the subyte, the previous 946 * output may be invalid. 947 */ 948 vm_map_lock_read(map); 949 if (timestamp != map->timestamp) 950 goto RestartScan; 951 vm_map_unlock_read(map); 952 done2: 953 return (error); 954 } 955 956 #ifndef _SYS_SYSPROTO_H_ 957 struct mlock_args { 958 const void *addr; 959 size_t len; 960 }; 961 #endif 962 /* 963 * MPSAFE 964 */ 965 int 966 mlock(td, uap) 967 struct thread *td; 968 struct mlock_args *uap; 969 { 970 struct proc *proc; 971 vm_offset_t addr, end, last, start; 972 vm_size_t npages, size; 973 int error; 974 975 error = priv_check(td, PRIV_VM_MLOCK); 976 if (error) 977 return (error); 978 addr = (vm_offset_t)uap->addr; 979 size = uap->len; 980 last = addr + size; 981 start = trunc_page(addr); 982 end = round_page(last); 983 if (last < addr || end < addr) 984 return (EINVAL); 985 npages = atop(end - start); 986 if (npages > vm_page_max_wired) 987 return (ENOMEM); 988 proc = td->td_proc; 989 PROC_LOCK(proc); 990 if (ptoa(npages + 991 pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > 992 lim_cur(proc, RLIMIT_MEMLOCK)) { 993 PROC_UNLOCK(proc); 994 return (ENOMEM); 995 } 996 PROC_UNLOCK(proc); 997 if (npages + cnt.v_wire_count > vm_page_max_wired) 998 return (EAGAIN); 999 error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, 1000 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1001 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1002 } 1003 1004 #ifndef _SYS_SYSPROTO_H_ 1005 struct mlockall_args { 1006 int how; 1007 }; 1008 #endif 1009 1010 /* 1011 * MPSAFE 1012 */ 1013 int 1014 mlockall(td, uap) 1015 struct thread *td; 1016 struct mlockall_args *uap; 1017 { 1018 vm_map_t map; 1019 int error; 1020 1021 map = &td->td_proc->p_vmspace->vm_map; 1022 error = 0; 1023 1024 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1025 return (EINVAL); 1026 1027 #if 0 1028 /* 1029 * If wiring all pages in the process would cause it to exceed 1030 * a hard resource limit, return ENOMEM. 1031 */ 1032 PROC_LOCK(td->td_proc); 1033 if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) > 1034 lim_cur(td->td_proc, RLIMIT_MEMLOCK))) { 1035 PROC_UNLOCK(td->td_proc); 1036 return (ENOMEM); 1037 } 1038 PROC_UNLOCK(td->td_proc); 1039 #else 1040 error = priv_check(td, PRIV_VM_MLOCK); 1041 if (error) 1042 return (error); 1043 #endif 1044 1045 if (uap->how & MCL_FUTURE) { 1046 vm_map_lock(map); 1047 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1048 vm_map_unlock(map); 1049 error = 0; 1050 } 1051 1052 if (uap->how & MCL_CURRENT) { 1053 /* 1054 * P1003.1-2001 mandates that all currently mapped pages 1055 * will be memory resident and locked (wired) upon return 1056 * from mlockall(). vm_map_wire() will wire pages, by 1057 * calling vm_fault_wire() for each page in the region. 1058 */ 1059 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1060 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1061 error = (error == KERN_SUCCESS ? 0 : EAGAIN); 1062 } 1063 1064 return (error); 1065 } 1066 1067 #ifndef _SYS_SYSPROTO_H_ 1068 struct munlockall_args { 1069 register_t dummy; 1070 }; 1071 #endif 1072 1073 /* 1074 * MPSAFE 1075 */ 1076 int 1077 munlockall(td, uap) 1078 struct thread *td; 1079 struct munlockall_args *uap; 1080 { 1081 vm_map_t map; 1082 int error; 1083 1084 map = &td->td_proc->p_vmspace->vm_map; 1085 error = priv_check(td, PRIV_VM_MUNLOCK); 1086 if (error) 1087 return (error); 1088 1089 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1090 vm_map_lock(map); 1091 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1092 vm_map_unlock(map); 1093 1094 /* Forcibly unwire all pages. */ 1095 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1096 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1097 1098 return (error); 1099 } 1100 1101 #ifndef _SYS_SYSPROTO_H_ 1102 struct munlock_args { 1103 const void *addr; 1104 size_t len; 1105 }; 1106 #endif 1107 /* 1108 * MPSAFE 1109 */ 1110 int 1111 munlock(td, uap) 1112 struct thread *td; 1113 struct munlock_args *uap; 1114 { 1115 vm_offset_t addr, end, last, start; 1116 vm_size_t size; 1117 int error; 1118 1119 error = priv_check(td, PRIV_VM_MUNLOCK); 1120 if (error) 1121 return (error); 1122 addr = (vm_offset_t)uap->addr; 1123 size = uap->len; 1124 last = addr + size; 1125 start = trunc_page(addr); 1126 end = round_page(last); 1127 if (last < addr || end < addr) 1128 return (EINVAL); 1129 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1130 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1131 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1132 } 1133 1134 /* 1135 * vm_mmap_vnode() 1136 * 1137 * MPSAFE 1138 * 1139 * Helper function for vm_mmap. Perform sanity check specific for mmap 1140 * operations on vnodes. 1141 */ 1142 int 1143 vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1144 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1145 struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp) 1146 { 1147 struct vattr va; 1148 void *handle; 1149 vm_object_t obj; 1150 struct mount *mp; 1151 struct cdevsw *dsw; 1152 struct ucred *cred; 1153 int error, flags, type; 1154 int vfslocked; 1155 1156 mp = vp->v_mount; 1157 cred = td->td_ucred; 1158 vfslocked = VFS_LOCK_GIANT(mp); 1159 if ((error = vget(vp, LK_SHARED, td)) != 0) { 1160 VFS_UNLOCK_GIANT(vfslocked); 1161 return (error); 1162 } 1163 flags = *flagsp; 1164 obj = vp->v_object; 1165 if (vp->v_type == VREG) { 1166 /* 1167 * Get the proper underlying object 1168 */ 1169 if (obj == NULL) { 1170 error = EINVAL; 1171 goto done; 1172 } 1173 if (obj->handle != vp) { 1174 vput(vp); 1175 vp = (struct vnode*)obj->handle; 1176 vget(vp, LK_SHARED, td); 1177 } 1178 type = OBJT_VNODE; 1179 handle = vp; 1180 } else if (vp->v_type == VCHR) { 1181 type = OBJT_DEVICE; 1182 handle = vp->v_rdev; 1183 1184 dsw = dev_refthread(handle); 1185 if (dsw == NULL) { 1186 error = ENXIO; 1187 goto done; 1188 } 1189 if (dsw->d_flags & D_MMAP_ANON) { 1190 dev_relthread(handle); 1191 *maxprotp = VM_PROT_ALL; 1192 *flagsp |= MAP_ANON; 1193 error = 0; 1194 goto done; 1195 } 1196 dev_relthread(handle); 1197 /* 1198 * cdevs does not provide private mappings of any kind. 1199 */ 1200 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1201 (prot & PROT_WRITE) != 0) { 1202 error = EACCES; 1203 goto done; 1204 } 1205 if (flags & (MAP_PRIVATE|MAP_COPY)) { 1206 error = EINVAL; 1207 goto done; 1208 } 1209 /* 1210 * Force device mappings to be shared. 1211 */ 1212 flags |= MAP_SHARED; 1213 } else { 1214 error = EINVAL; 1215 goto done; 1216 } 1217 if ((error = VOP_GETATTR(vp, &va, cred))) 1218 goto done; 1219 #ifdef MAC 1220 error = mac_vnode_check_mmap(cred, vp, prot, flags); 1221 if (error != 0) 1222 goto done; 1223 #endif 1224 if ((flags & MAP_SHARED) != 0) { 1225 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1226 if (prot & PROT_WRITE) { 1227 error = EPERM; 1228 goto done; 1229 } 1230 *maxprotp &= ~VM_PROT_WRITE; 1231 } 1232 } 1233 /* 1234 * If it is a regular file without any references 1235 * we do not need to sync it. 1236 * Adjust object size to be the size of actual file. 1237 */ 1238 if (vp->v_type == VREG) { 1239 objsize = round_page(va.va_size); 1240 if (va.va_nlink == 0) 1241 flags |= MAP_NOSYNC; 1242 } 1243 obj = vm_pager_allocate(type, handle, objsize, prot, foff); 1244 if (obj == NULL) { 1245 error = (type == OBJT_DEVICE ? EINVAL : ENOMEM); 1246 goto done; 1247 } 1248 *objp = obj; 1249 *flagsp = flags; 1250 vfs_mark_atime(vp, cred); 1251 1252 done: 1253 vput(vp); 1254 VFS_UNLOCK_GIANT(vfslocked); 1255 return (error); 1256 } 1257 1258 /* 1259 * vm_mmap_cdev() 1260 * 1261 * MPSAFE 1262 * 1263 * Helper function for vm_mmap. Perform sanity check specific for mmap 1264 * operations on cdevs. 1265 */ 1266 int 1267 vm_mmap_cdev(struct thread *td, vm_size_t objsize, 1268 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1269 struct cdev *cdev, vm_ooffset_t foff, vm_object_t *objp) 1270 { 1271 vm_object_t obj; 1272 struct cdevsw *dsw; 1273 int flags; 1274 1275 flags = *flagsp; 1276 1277 dsw = dev_refthread(cdev); 1278 if (dsw == NULL) 1279 return (ENXIO); 1280 if (dsw->d_flags & D_MMAP_ANON) { 1281 dev_relthread(cdev); 1282 *maxprotp = VM_PROT_ALL; 1283 *flagsp |= MAP_ANON; 1284 return (0); 1285 } 1286 dev_relthread(cdev); 1287 /* 1288 * cdevs does not provide private mappings of any kind. 1289 */ 1290 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1291 (prot & PROT_WRITE) != 0) 1292 return (EACCES); 1293 if (flags & (MAP_PRIVATE|MAP_COPY)) 1294 return (EINVAL); 1295 /* 1296 * Force device mappings to be shared. 1297 */ 1298 flags |= MAP_SHARED; 1299 #ifdef MAC_XXX 1300 error = mac_check_cdev_mmap(td->td_ucred, cdev, prot); 1301 if (error != 0) 1302 return (error); 1303 #endif 1304 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, foff); 1305 if (obj == NULL) 1306 return (EINVAL); 1307 *objp = obj; 1308 *flagsp = flags; 1309 return (0); 1310 } 1311 1312 /* 1313 * vm_mmap_shm() 1314 * 1315 * MPSAFE 1316 * 1317 * Helper function for vm_mmap. Perform sanity check specific for mmap 1318 * operations on shm file descriptors. 1319 */ 1320 int 1321 vm_mmap_shm(struct thread *td, vm_size_t objsize, 1322 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1323 struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) 1324 { 1325 int error; 1326 1327 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1328 (prot & PROT_WRITE) != 0) 1329 return (EACCES); 1330 #ifdef MAC 1331 error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); 1332 if (error != 0) 1333 return (error); 1334 #endif 1335 error = shm_mmap(shmfd, objsize, foff, objp); 1336 if (error) 1337 return (error); 1338 return (0); 1339 } 1340 1341 /* 1342 * vm_mmap() 1343 * 1344 * MPSAFE 1345 * 1346 * Internal version of mmap. Currently used by mmap, exec, and sys5 1347 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1348 */ 1349 int 1350 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1351 vm_prot_t maxprot, int flags, 1352 objtype_t handle_type, void *handle, 1353 vm_ooffset_t foff) 1354 { 1355 boolean_t fitit; 1356 vm_object_t object = NULL; 1357 int rv = KERN_SUCCESS; 1358 int docow, error; 1359 struct thread *td = curthread; 1360 1361 if (size == 0) 1362 return (0); 1363 1364 size = round_page(size); 1365 1366 PROC_LOCK(td->td_proc); 1367 if (td->td_proc->p_vmspace->vm_map.size + size > 1368 lim_cur(td->td_proc, RLIMIT_VMEM)) { 1369 PROC_UNLOCK(td->td_proc); 1370 return(ENOMEM); 1371 } 1372 PROC_UNLOCK(td->td_proc); 1373 1374 /* 1375 * We currently can only deal with page aligned file offsets. 1376 * The check is here rather than in the syscall because the 1377 * kernel calls this function internally for other mmaping 1378 * operations (such as in exec) and non-aligned offsets will 1379 * cause pmap inconsistencies...so we want to be sure to 1380 * disallow this in all cases. 1381 */ 1382 if (foff & PAGE_MASK) 1383 return (EINVAL); 1384 1385 if ((flags & MAP_FIXED) == 0) { 1386 fitit = TRUE; 1387 *addr = round_page(*addr); 1388 } else { 1389 if (*addr != trunc_page(*addr)) 1390 return (EINVAL); 1391 fitit = FALSE; 1392 } 1393 /* 1394 * Lookup/allocate object. 1395 */ 1396 switch (handle_type) { 1397 case OBJT_DEVICE: 1398 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, 1399 handle, foff, &object); 1400 break; 1401 case OBJT_VNODE: 1402 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1403 handle, foff, &object); 1404 break; 1405 case OBJT_SWAP: 1406 error = vm_mmap_shm(td, size, prot, &maxprot, &flags, 1407 handle, foff, &object); 1408 break; 1409 case OBJT_DEFAULT: 1410 if (handle == NULL) { 1411 error = 0; 1412 break; 1413 } 1414 /* FALLTHROUGH */ 1415 default: 1416 error = EINVAL; 1417 break; 1418 } 1419 if (error) 1420 return (error); 1421 if (flags & MAP_ANON) { 1422 object = NULL; 1423 docow = 0; 1424 /* 1425 * Unnamed anonymous regions always start at 0. 1426 */ 1427 if (handle == 0) 1428 foff = 0; 1429 } else { 1430 docow = MAP_PREFAULT_PARTIAL; 1431 } 1432 1433 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1434 docow |= MAP_COPY_ON_WRITE; 1435 if (flags & MAP_NOSYNC) 1436 docow |= MAP_DISABLE_SYNCER; 1437 if (flags & MAP_NOCORE) 1438 docow |= MAP_DISABLE_COREDUMP; 1439 1440 if (flags & MAP_STACK) 1441 rv = vm_map_stack(map, *addr, size, prot, maxprot, 1442 docow | MAP_STACK_GROWS_DOWN); 1443 else if (fitit) 1444 rv = vm_map_find(map, object, foff, addr, size, 1445 object != NULL && object->type == OBJT_DEVICE ? 1446 VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow); 1447 else 1448 rv = vm_map_fixed(map, object, foff, *addr, size, 1449 prot, maxprot, docow); 1450 1451 if (rv != KERN_SUCCESS) { 1452 /* 1453 * Lose the object reference. Will destroy the 1454 * object if it's an unnamed anonymous mapping 1455 * or named anonymous without other references. 1456 */ 1457 vm_object_deallocate(object); 1458 } else if (flags & MAP_SHARED) { 1459 /* 1460 * Shared memory is also shared with children. 1461 */ 1462 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); 1463 if (rv != KERN_SUCCESS) 1464 (void) vm_map_remove(map, *addr, *addr + size); 1465 } 1466 1467 /* 1468 * If the process has requested that all future mappings 1469 * be wired, then heed this. 1470 */ 1471 if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE)) 1472 vm_map_wire(map, *addr, *addr + size, 1473 VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); 1474 1475 switch (rv) { 1476 case KERN_SUCCESS: 1477 return (0); 1478 case KERN_INVALID_ADDRESS: 1479 case KERN_NO_SPACE: 1480 return (ENOMEM); 1481 case KERN_PROTECTION_FAILURE: 1482 return (EACCES); 1483 default: 1484 return (EINVAL); 1485 } 1486 } 1487