1 /*- 2 * Copyright (c) 1988 University of Utah. 3 * Copyright (c) 1991, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the Systems Programming Group of the University of Utah Computer 8 * Science Department. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 35 * 36 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 37 */ 38 39 /* 40 * Mapped file (mmap) interface to VM 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_compat.h" 47 #include "opt_hwpmc_hooks.h" 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/kernel.h> 52 #include <sys/lock.h> 53 #include <sys/mutex.h> 54 #include <sys/sysproto.h> 55 #include <sys/filedesc.h> 56 #include <sys/priv.h> 57 #include <sys/proc.h> 58 #include <sys/resource.h> 59 #include <sys/resourcevar.h> 60 #include <sys/vnode.h> 61 #include <sys/fcntl.h> 62 #include <sys/file.h> 63 #include <sys/mman.h> 64 #include <sys/mount.h> 65 #include <sys/conf.h> 66 #include <sys/stat.h> 67 #include <sys/vmmeter.h> 68 #include <sys/sysctl.h> 69 70 #include <security/mac/mac_framework.h> 71 72 #include <vm/vm.h> 73 #include <vm/vm_param.h> 74 #include <vm/pmap.h> 75 #include <vm/vm_map.h> 76 #include <vm/vm_object.h> 77 #include <vm/vm_page.h> 78 #include <vm/vm_pager.h> 79 #include <vm/vm_pageout.h> 80 #include <vm/vm_extern.h> 81 #include <vm/vm_page.h> 82 #include <vm/vm_kern.h> 83 84 #ifdef HWPMC_HOOKS 85 #include <sys/pmckern.h> 86 #endif 87 88 #ifndef _SYS_SYSPROTO_H_ 89 struct sbrk_args { 90 int incr; 91 }; 92 #endif 93 94 static int max_proc_mmap; 95 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, 96 "Maximum number of memory-mapped files per process"); 97 98 /* 99 * Set the maximum number of vm_map_entry structures per process. Roughly 100 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 101 * of our KVM malloc space still results in generous limits. We want a 102 * default that is good enough to prevent the kernel running out of resources 103 * if attacked from compromised user account but generous enough such that 104 * multi-threaded processes are not unduly inconvenienced. 105 */ 106 static void vmmapentry_rsrc_init(void *); 107 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, 108 NULL); 109 110 static void 111 vmmapentry_rsrc_init(dummy) 112 void *dummy; 113 { 114 max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); 115 max_proc_mmap /= 100; 116 } 117 118 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 119 int *, struct vnode *, vm_ooffset_t *, vm_object_t *); 120 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 121 int *, struct cdev *, vm_ooffset_t *, vm_object_t *); 122 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 123 int *, struct shmfd *, vm_ooffset_t, vm_object_t *); 124 125 /* 126 * MPSAFE 127 */ 128 /* ARGSUSED */ 129 int 130 sbrk(td, uap) 131 struct thread *td; 132 struct sbrk_args *uap; 133 { 134 /* Not yet implemented */ 135 return (EOPNOTSUPP); 136 } 137 138 #ifndef _SYS_SYSPROTO_H_ 139 struct sstk_args { 140 int incr; 141 }; 142 #endif 143 144 /* 145 * MPSAFE 146 */ 147 /* ARGSUSED */ 148 int 149 sstk(td, uap) 150 struct thread *td; 151 struct sstk_args *uap; 152 { 153 /* Not yet implemented */ 154 return (EOPNOTSUPP); 155 } 156 157 #if defined(COMPAT_43) 158 #ifndef _SYS_SYSPROTO_H_ 159 struct getpagesize_args { 160 int dummy; 161 }; 162 #endif 163 164 /* ARGSUSED */ 165 int 166 ogetpagesize(td, uap) 167 struct thread *td; 168 struct getpagesize_args *uap; 169 { 170 /* MP SAFE */ 171 td->td_retval[0] = PAGE_SIZE; 172 return (0); 173 } 174 #endif /* COMPAT_43 */ 175 176 177 /* 178 * Memory Map (mmap) system call. Note that the file offset 179 * and address are allowed to be NOT page aligned, though if 180 * the MAP_FIXED flag it set, both must have the same remainder 181 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 182 * page-aligned, the actual mapping starts at trunc_page(addr) 183 * and the return value is adjusted up by the page offset. 184 * 185 * Generally speaking, only character devices which are themselves 186 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 187 * there would be no cache coherency between a descriptor and a VM mapping 188 * both to the same character device. 189 */ 190 #ifndef _SYS_SYSPROTO_H_ 191 struct mmap_args { 192 void *addr; 193 size_t len; 194 int prot; 195 int flags; 196 int fd; 197 long pad; 198 off_t pos; 199 }; 200 #endif 201 202 /* 203 * MPSAFE 204 */ 205 int 206 mmap(td, uap) 207 struct thread *td; 208 struct mmap_args *uap; 209 { 210 #ifdef HWPMC_HOOKS 211 struct pmckern_map_in pkm; 212 #endif 213 struct file *fp; 214 struct vnode *vp; 215 vm_offset_t addr; 216 vm_size_t size, pageoff; 217 vm_prot_t prot, maxprot; 218 void *handle; 219 objtype_t handle_type; 220 int flags, error; 221 off_t pos; 222 struct vmspace *vms = td->td_proc->p_vmspace; 223 224 addr = (vm_offset_t) uap->addr; 225 size = uap->len; 226 prot = uap->prot & VM_PROT_ALL; 227 flags = uap->flags; 228 pos = uap->pos; 229 230 fp = NULL; 231 /* make sure mapping fits into numeric range etc */ 232 if ((ssize_t) uap->len < 0 || 233 ((flags & MAP_ANON) && uap->fd != -1)) 234 return (EINVAL); 235 236 if (flags & MAP_STACK) { 237 if ((uap->fd != -1) || 238 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 239 return (EINVAL); 240 flags |= MAP_ANON; 241 pos = 0; 242 } 243 244 /* 245 * Align the file position to a page boundary, 246 * and save its page offset component. 247 */ 248 pageoff = (pos & PAGE_MASK); 249 pos -= pageoff; 250 251 /* Adjust size for rounding (on both ends). */ 252 size += pageoff; /* low end... */ 253 size = (vm_size_t) round_page(size); /* hi end */ 254 255 /* 256 * Check for illegal addresses. Watch out for address wrap... Note 257 * that VM_*_ADDRESS are not constants due to casts (argh). 258 */ 259 if (flags & MAP_FIXED) { 260 /* 261 * The specified address must have the same remainder 262 * as the file offset taken modulo PAGE_SIZE, so it 263 * should be aligned after adjustment by pageoff. 264 */ 265 addr -= pageoff; 266 if (addr & PAGE_MASK) 267 return (EINVAL); 268 /* Address range must be all in user VM space. */ 269 if (addr < vm_map_min(&vms->vm_map) || 270 addr + size > vm_map_max(&vms->vm_map)) 271 return (EINVAL); 272 if (addr + size < addr) 273 return (EINVAL); 274 } else { 275 /* 276 * XXX for non-fixed mappings where no hint is provided or 277 * the hint would fall in the potential heap space, 278 * place it after the end of the largest possible heap. 279 * 280 * There should really be a pmap call to determine a reasonable 281 * location. 282 */ 283 PROC_LOCK(td->td_proc); 284 if (addr == 0 || 285 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 286 addr < round_page((vm_offset_t)vms->vm_daddr + 287 lim_max(td->td_proc, RLIMIT_DATA)))) 288 addr = round_page((vm_offset_t)vms->vm_daddr + 289 lim_max(td->td_proc, RLIMIT_DATA)); 290 PROC_UNLOCK(td->td_proc); 291 } 292 if (flags & MAP_ANON) { 293 /* 294 * Mapping blank space is trivial. 295 */ 296 handle = NULL; 297 handle_type = OBJT_DEFAULT; 298 maxprot = VM_PROT_ALL; 299 pos = 0; 300 } else { 301 /* 302 * Mapping file, get fp for validation and 303 * don't let the descriptor disappear on us if we block. 304 */ 305 if ((error = fget(td, uap->fd, &fp)) != 0) 306 goto done; 307 if (fp->f_type == DTYPE_SHM) { 308 handle = fp->f_data; 309 handle_type = OBJT_SWAP; 310 maxprot = VM_PROT_NONE; 311 312 /* FREAD should always be set. */ 313 if (fp->f_flag & FREAD) 314 maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; 315 if (fp->f_flag & FWRITE) 316 maxprot |= VM_PROT_WRITE; 317 goto map; 318 } 319 if (fp->f_type != DTYPE_VNODE) { 320 error = ENODEV; 321 goto done; 322 } 323 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ 324 defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) 325 /* 326 * POSIX shared-memory objects are defined to have 327 * kernel persistence, and are not defined to support 328 * read(2)/write(2) -- or even open(2). Thus, we can 329 * use MAP_ASYNC to trade on-disk coherence for speed. 330 * The shm_open(3) library routine turns on the FPOSIXSHM 331 * flag to request this behavior. 332 */ 333 if (fp->f_flag & FPOSIXSHM) 334 flags |= MAP_NOSYNC; 335 #endif 336 vp = fp->f_vnode; 337 /* 338 * Ensure that file and memory protections are 339 * compatible. Note that we only worry about 340 * writability if mapping is shared; in this case, 341 * current and max prot are dictated by the open file. 342 * XXX use the vnode instead? Problem is: what 343 * credentials do we use for determination? What if 344 * proc does a setuid? 345 */ 346 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) 347 maxprot = VM_PROT_NONE; 348 else 349 maxprot = VM_PROT_EXECUTE; 350 if (fp->f_flag & FREAD) { 351 maxprot |= VM_PROT_READ; 352 } else if (prot & PROT_READ) { 353 error = EACCES; 354 goto done; 355 } 356 /* 357 * If we are sharing potential changes (either via 358 * MAP_SHARED or via the implicit sharing of character 359 * device mappings), and we are trying to get write 360 * permission although we opened it without asking 361 * for it, bail out. 362 */ 363 if ((flags & MAP_SHARED) != 0) { 364 if ((fp->f_flag & FWRITE) != 0) { 365 maxprot |= VM_PROT_WRITE; 366 } else if ((prot & PROT_WRITE) != 0) { 367 error = EACCES; 368 goto done; 369 } 370 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { 371 maxprot |= VM_PROT_WRITE; 372 } 373 handle = (void *)vp; 374 handle_type = OBJT_VNODE; 375 } 376 map: 377 378 /* 379 * Do not allow more then a certain number of vm_map_entry structures 380 * per process. Scale with the number of rforks sharing the map 381 * to make the limit reasonable for threads. 382 */ 383 if (max_proc_mmap && 384 vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { 385 error = ENOMEM; 386 goto done; 387 } 388 389 td->td_fpop = fp; 390 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 391 flags, handle_type, handle, pos); 392 td->td_fpop = NULL; 393 #ifdef HWPMC_HOOKS 394 /* inform hwpmc(4) if an executable is being mapped */ 395 if (error == 0 && handle_type == OBJT_VNODE && 396 (prot & PROT_EXEC)) { 397 pkm.pm_file = handle; 398 pkm.pm_address = (uintptr_t) addr; 399 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); 400 } 401 #endif 402 if (error == 0) 403 td->td_retval[0] = (register_t) (addr + pageoff); 404 done: 405 if (fp) 406 fdrop(fp, td); 407 408 return (error); 409 } 410 411 int 412 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 413 { 414 struct mmap_args oargs; 415 416 oargs.addr = uap->addr; 417 oargs.len = uap->len; 418 oargs.prot = uap->prot; 419 oargs.flags = uap->flags; 420 oargs.fd = uap->fd; 421 oargs.pos = uap->pos; 422 return (mmap(td, &oargs)); 423 } 424 425 #ifdef COMPAT_43 426 #ifndef _SYS_SYSPROTO_H_ 427 struct ommap_args { 428 caddr_t addr; 429 int len; 430 int prot; 431 int flags; 432 int fd; 433 long pos; 434 }; 435 #endif 436 int 437 ommap(td, uap) 438 struct thread *td; 439 struct ommap_args *uap; 440 { 441 struct mmap_args nargs; 442 static const char cvtbsdprot[8] = { 443 0, 444 PROT_EXEC, 445 PROT_WRITE, 446 PROT_EXEC | PROT_WRITE, 447 PROT_READ, 448 PROT_EXEC | PROT_READ, 449 PROT_WRITE | PROT_READ, 450 PROT_EXEC | PROT_WRITE | PROT_READ, 451 }; 452 453 #define OMAP_ANON 0x0002 454 #define OMAP_COPY 0x0020 455 #define OMAP_SHARED 0x0010 456 #define OMAP_FIXED 0x0100 457 458 nargs.addr = uap->addr; 459 nargs.len = uap->len; 460 nargs.prot = cvtbsdprot[uap->prot & 0x7]; 461 nargs.flags = 0; 462 if (uap->flags & OMAP_ANON) 463 nargs.flags |= MAP_ANON; 464 if (uap->flags & OMAP_COPY) 465 nargs.flags |= MAP_COPY; 466 if (uap->flags & OMAP_SHARED) 467 nargs.flags |= MAP_SHARED; 468 else 469 nargs.flags |= MAP_PRIVATE; 470 if (uap->flags & OMAP_FIXED) 471 nargs.flags |= MAP_FIXED; 472 nargs.fd = uap->fd; 473 nargs.pos = uap->pos; 474 return (mmap(td, &nargs)); 475 } 476 #endif /* COMPAT_43 */ 477 478 479 #ifndef _SYS_SYSPROTO_H_ 480 struct msync_args { 481 void *addr; 482 size_t len; 483 int flags; 484 }; 485 #endif 486 /* 487 * MPSAFE 488 */ 489 int 490 msync(td, uap) 491 struct thread *td; 492 struct msync_args *uap; 493 { 494 vm_offset_t addr; 495 vm_size_t size, pageoff; 496 int flags; 497 vm_map_t map; 498 int rv; 499 500 addr = (vm_offset_t) uap->addr; 501 size = uap->len; 502 flags = uap->flags; 503 504 pageoff = (addr & PAGE_MASK); 505 addr -= pageoff; 506 size += pageoff; 507 size = (vm_size_t) round_page(size); 508 if (addr + size < addr) 509 return (EINVAL); 510 511 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 512 return (EINVAL); 513 514 map = &td->td_proc->p_vmspace->vm_map; 515 516 /* 517 * Clean the pages and interpret the return value. 518 */ 519 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 520 (flags & MS_INVALIDATE) != 0); 521 switch (rv) { 522 case KERN_SUCCESS: 523 return (0); 524 case KERN_INVALID_ADDRESS: 525 return (EINVAL); /* Sun returns ENOMEM? */ 526 case KERN_INVALID_ARGUMENT: 527 return (EBUSY); 528 default: 529 return (EINVAL); 530 } 531 } 532 533 #ifndef _SYS_SYSPROTO_H_ 534 struct munmap_args { 535 void *addr; 536 size_t len; 537 }; 538 #endif 539 /* 540 * MPSAFE 541 */ 542 int 543 munmap(td, uap) 544 struct thread *td; 545 struct munmap_args *uap; 546 { 547 #ifdef HWPMC_HOOKS 548 struct pmckern_map_out pkm; 549 vm_map_entry_t entry; 550 #endif 551 vm_offset_t addr; 552 vm_size_t size, pageoff; 553 vm_map_t map; 554 555 addr = (vm_offset_t) uap->addr; 556 size = uap->len; 557 if (size == 0) 558 return (EINVAL); 559 560 pageoff = (addr & PAGE_MASK); 561 addr -= pageoff; 562 size += pageoff; 563 size = (vm_size_t) round_page(size); 564 if (addr + size < addr) 565 return (EINVAL); 566 567 /* 568 * Check for illegal addresses. Watch out for address wrap... 569 */ 570 map = &td->td_proc->p_vmspace->vm_map; 571 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 572 return (EINVAL); 573 vm_map_lock(map); 574 #ifdef HWPMC_HOOKS 575 /* 576 * Inform hwpmc if the address range being unmapped contains 577 * an executable region. 578 */ 579 if (vm_map_lookup_entry(map, addr, &entry)) { 580 for (; 581 entry != &map->header && entry->start < addr + size; 582 entry = entry->next) { 583 if (vm_map_check_protection(map, entry->start, 584 entry->end, VM_PROT_EXECUTE) == TRUE) { 585 pkm.pm_address = (uintptr_t) addr; 586 pkm.pm_size = (size_t) size; 587 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, 588 (void *) &pkm); 589 break; 590 } 591 } 592 } 593 #endif 594 /* returns nothing but KERN_SUCCESS anyway */ 595 vm_map_delete(map, addr, addr + size); 596 vm_map_unlock(map); 597 return (0); 598 } 599 600 #ifndef _SYS_SYSPROTO_H_ 601 struct mprotect_args { 602 const void *addr; 603 size_t len; 604 int prot; 605 }; 606 #endif 607 /* 608 * MPSAFE 609 */ 610 int 611 mprotect(td, uap) 612 struct thread *td; 613 struct mprotect_args *uap; 614 { 615 vm_offset_t addr; 616 vm_size_t size, pageoff; 617 vm_prot_t prot; 618 619 addr = (vm_offset_t) uap->addr; 620 size = uap->len; 621 prot = uap->prot & VM_PROT_ALL; 622 623 pageoff = (addr & PAGE_MASK); 624 addr -= pageoff; 625 size += pageoff; 626 size = (vm_size_t) round_page(size); 627 if (addr + size < addr) 628 return (EINVAL); 629 630 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 631 addr + size, prot, FALSE)) { 632 case KERN_SUCCESS: 633 return (0); 634 case KERN_PROTECTION_FAILURE: 635 return (EACCES); 636 } 637 return (EINVAL); 638 } 639 640 #ifndef _SYS_SYSPROTO_H_ 641 struct minherit_args { 642 void *addr; 643 size_t len; 644 int inherit; 645 }; 646 #endif 647 /* 648 * MPSAFE 649 */ 650 int 651 minherit(td, uap) 652 struct thread *td; 653 struct minherit_args *uap; 654 { 655 vm_offset_t addr; 656 vm_size_t size, pageoff; 657 vm_inherit_t inherit; 658 659 addr = (vm_offset_t)uap->addr; 660 size = uap->len; 661 inherit = uap->inherit; 662 663 pageoff = (addr & PAGE_MASK); 664 addr -= pageoff; 665 size += pageoff; 666 size = (vm_size_t) round_page(size); 667 if (addr + size < addr) 668 return (EINVAL); 669 670 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 671 addr + size, inherit)) { 672 case KERN_SUCCESS: 673 return (0); 674 case KERN_PROTECTION_FAILURE: 675 return (EACCES); 676 } 677 return (EINVAL); 678 } 679 680 #ifndef _SYS_SYSPROTO_H_ 681 struct madvise_args { 682 void *addr; 683 size_t len; 684 int behav; 685 }; 686 #endif 687 688 /* 689 * MPSAFE 690 */ 691 /* ARGSUSED */ 692 int 693 madvise(td, uap) 694 struct thread *td; 695 struct madvise_args *uap; 696 { 697 vm_offset_t start, end; 698 vm_map_t map; 699 struct proc *p; 700 int error; 701 702 /* 703 * Check for our special case, advising the swap pager we are 704 * "immortal." 705 */ 706 if (uap->behav == MADV_PROTECT) { 707 error = priv_check(td, PRIV_VM_MADV_PROTECT); 708 if (error == 0) { 709 p = td->td_proc; 710 PROC_LOCK(p); 711 p->p_flag |= P_PROTECTED; 712 PROC_UNLOCK(p); 713 } 714 return (error); 715 } 716 /* 717 * Check for illegal behavior 718 */ 719 if (uap->behav < 0 || uap->behav > MADV_CORE) 720 return (EINVAL); 721 /* 722 * Check for illegal addresses. Watch out for address wrap... Note 723 * that VM_*_ADDRESS are not constants due to casts (argh). 724 */ 725 map = &td->td_proc->p_vmspace->vm_map; 726 if ((vm_offset_t)uap->addr < vm_map_min(map) || 727 (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) 728 return (EINVAL); 729 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 730 return (EINVAL); 731 732 /* 733 * Since this routine is only advisory, we default to conservative 734 * behavior. 735 */ 736 start = trunc_page((vm_offset_t) uap->addr); 737 end = round_page((vm_offset_t) uap->addr + uap->len); 738 739 if (vm_map_madvise(map, start, end, uap->behav)) 740 return (EINVAL); 741 return (0); 742 } 743 744 #ifndef _SYS_SYSPROTO_H_ 745 struct mincore_args { 746 const void *addr; 747 size_t len; 748 char *vec; 749 }; 750 #endif 751 752 /* 753 * MPSAFE 754 */ 755 /* ARGSUSED */ 756 int 757 mincore(td, uap) 758 struct thread *td; 759 struct mincore_args *uap; 760 { 761 vm_offset_t addr, first_addr; 762 vm_offset_t end, cend; 763 pmap_t pmap; 764 vm_map_t map; 765 char *vec; 766 int error = 0; 767 int vecindex, lastvecindex; 768 vm_map_entry_t current; 769 vm_map_entry_t entry; 770 int mincoreinfo; 771 unsigned int timestamp; 772 773 /* 774 * Make sure that the addresses presented are valid for user 775 * mode. 776 */ 777 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 778 end = addr + (vm_size_t)round_page(uap->len); 779 map = &td->td_proc->p_vmspace->vm_map; 780 if (end > vm_map_max(map) || end < addr) 781 return (ENOMEM); 782 783 /* 784 * Address of byte vector 785 */ 786 vec = uap->vec; 787 788 pmap = vmspace_pmap(td->td_proc->p_vmspace); 789 790 vm_map_lock_read(map); 791 RestartScan: 792 timestamp = map->timestamp; 793 794 if (!vm_map_lookup_entry(map, addr, &entry)) { 795 vm_map_unlock_read(map); 796 return (ENOMEM); 797 } 798 799 /* 800 * Do this on a map entry basis so that if the pages are not 801 * in the current processes address space, we can easily look 802 * up the pages elsewhere. 803 */ 804 lastvecindex = -1; 805 for (current = entry; 806 (current != &map->header) && (current->start < end); 807 current = current->next) { 808 809 /* 810 * check for contiguity 811 */ 812 if (current->end < end && 813 (entry->next == &map->header || 814 current->next->start > current->end)) { 815 vm_map_unlock_read(map); 816 return (ENOMEM); 817 } 818 819 /* 820 * ignore submaps (for now) or null objects 821 */ 822 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 823 current->object.vm_object == NULL) 824 continue; 825 826 /* 827 * limit this scan to the current map entry and the 828 * limits for the mincore call 829 */ 830 if (addr < current->start) 831 addr = current->start; 832 cend = current->end; 833 if (cend > end) 834 cend = end; 835 836 /* 837 * scan this entry one page at a time 838 */ 839 while (addr < cend) { 840 /* 841 * Check pmap first, it is likely faster, also 842 * it can provide info as to whether we are the 843 * one referencing or modifying the page. 844 */ 845 mincoreinfo = pmap_mincore(pmap, addr); 846 if (!mincoreinfo) { 847 vm_pindex_t pindex; 848 vm_ooffset_t offset; 849 vm_page_t m; 850 /* 851 * calculate the page index into the object 852 */ 853 offset = current->offset + (addr - current->start); 854 pindex = OFF_TO_IDX(offset); 855 VM_OBJECT_LOCK(current->object.vm_object); 856 m = vm_page_lookup(current->object.vm_object, 857 pindex); 858 /* 859 * if the page is resident, then gather information about 860 * it. 861 */ 862 if (m != NULL && m->valid != 0) { 863 mincoreinfo = MINCORE_INCORE; 864 vm_page_lock_queues(); 865 if (m->dirty || 866 pmap_is_modified(m)) 867 mincoreinfo |= MINCORE_MODIFIED_OTHER; 868 if ((m->flags & PG_REFERENCED) || 869 pmap_ts_referenced(m)) { 870 vm_page_flag_set(m, PG_REFERENCED); 871 mincoreinfo |= MINCORE_REFERENCED_OTHER; 872 } 873 vm_page_unlock_queues(); 874 } 875 VM_OBJECT_UNLOCK(current->object.vm_object); 876 } 877 878 /* 879 * subyte may page fault. In case it needs to modify 880 * the map, we release the lock. 881 */ 882 vm_map_unlock_read(map); 883 884 /* 885 * calculate index into user supplied byte vector 886 */ 887 vecindex = OFF_TO_IDX(addr - first_addr); 888 889 /* 890 * If we have skipped map entries, we need to make sure that 891 * the byte vector is zeroed for those skipped entries. 892 */ 893 while ((lastvecindex + 1) < vecindex) { 894 error = subyte(vec + lastvecindex, 0); 895 if (error) { 896 error = EFAULT; 897 goto done2; 898 } 899 ++lastvecindex; 900 } 901 902 /* 903 * Pass the page information to the user 904 */ 905 error = subyte(vec + vecindex, mincoreinfo); 906 if (error) { 907 error = EFAULT; 908 goto done2; 909 } 910 911 /* 912 * If the map has changed, due to the subyte, the previous 913 * output may be invalid. 914 */ 915 vm_map_lock_read(map); 916 if (timestamp != map->timestamp) 917 goto RestartScan; 918 919 lastvecindex = vecindex; 920 addr += PAGE_SIZE; 921 } 922 } 923 924 /* 925 * subyte may page fault. In case it needs to modify 926 * the map, we release the lock. 927 */ 928 vm_map_unlock_read(map); 929 930 /* 931 * Zero the last entries in the byte vector. 932 */ 933 vecindex = OFF_TO_IDX(end - first_addr); 934 while ((lastvecindex + 1) < vecindex) { 935 error = subyte(vec + lastvecindex, 0); 936 if (error) { 937 error = EFAULT; 938 goto done2; 939 } 940 ++lastvecindex; 941 } 942 943 /* 944 * If the map has changed, due to the subyte, the previous 945 * output may be invalid. 946 */ 947 vm_map_lock_read(map); 948 if (timestamp != map->timestamp) 949 goto RestartScan; 950 vm_map_unlock_read(map); 951 done2: 952 return (error); 953 } 954 955 #ifndef _SYS_SYSPROTO_H_ 956 struct mlock_args { 957 const void *addr; 958 size_t len; 959 }; 960 #endif 961 /* 962 * MPSAFE 963 */ 964 int 965 mlock(td, uap) 966 struct thread *td; 967 struct mlock_args *uap; 968 { 969 struct proc *proc; 970 vm_offset_t addr, end, last, start; 971 vm_size_t npages, size; 972 int error; 973 974 error = priv_check(td, PRIV_VM_MLOCK); 975 if (error) 976 return (error); 977 addr = (vm_offset_t)uap->addr; 978 size = uap->len; 979 last = addr + size; 980 start = trunc_page(addr); 981 end = round_page(last); 982 if (last < addr || end < addr) 983 return (EINVAL); 984 npages = atop(end - start); 985 if (npages > vm_page_max_wired) 986 return (ENOMEM); 987 proc = td->td_proc; 988 PROC_LOCK(proc); 989 if (ptoa(npages + 990 pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > 991 lim_cur(proc, RLIMIT_MEMLOCK)) { 992 PROC_UNLOCK(proc); 993 return (ENOMEM); 994 } 995 PROC_UNLOCK(proc); 996 if (npages + cnt.v_wire_count > vm_page_max_wired) 997 return (EAGAIN); 998 error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, 999 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1000 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1001 } 1002 1003 #ifndef _SYS_SYSPROTO_H_ 1004 struct mlockall_args { 1005 int how; 1006 }; 1007 #endif 1008 1009 /* 1010 * MPSAFE 1011 */ 1012 int 1013 mlockall(td, uap) 1014 struct thread *td; 1015 struct mlockall_args *uap; 1016 { 1017 vm_map_t map; 1018 int error; 1019 1020 map = &td->td_proc->p_vmspace->vm_map; 1021 error = 0; 1022 1023 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1024 return (EINVAL); 1025 1026 #if 0 1027 /* 1028 * If wiring all pages in the process would cause it to exceed 1029 * a hard resource limit, return ENOMEM. 1030 */ 1031 PROC_LOCK(td->td_proc); 1032 if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) > 1033 lim_cur(td->td_proc, RLIMIT_MEMLOCK))) { 1034 PROC_UNLOCK(td->td_proc); 1035 return (ENOMEM); 1036 } 1037 PROC_UNLOCK(td->td_proc); 1038 #else 1039 error = priv_check(td, PRIV_VM_MLOCK); 1040 if (error) 1041 return (error); 1042 #endif 1043 1044 if (uap->how & MCL_FUTURE) { 1045 vm_map_lock(map); 1046 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1047 vm_map_unlock(map); 1048 error = 0; 1049 } 1050 1051 if (uap->how & MCL_CURRENT) { 1052 /* 1053 * P1003.1-2001 mandates that all currently mapped pages 1054 * will be memory resident and locked (wired) upon return 1055 * from mlockall(). vm_map_wire() will wire pages, by 1056 * calling vm_fault_wire() for each page in the region. 1057 */ 1058 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1059 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1060 error = (error == KERN_SUCCESS ? 0 : EAGAIN); 1061 } 1062 1063 return (error); 1064 } 1065 1066 #ifndef _SYS_SYSPROTO_H_ 1067 struct munlockall_args { 1068 register_t dummy; 1069 }; 1070 #endif 1071 1072 /* 1073 * MPSAFE 1074 */ 1075 int 1076 munlockall(td, uap) 1077 struct thread *td; 1078 struct munlockall_args *uap; 1079 { 1080 vm_map_t map; 1081 int error; 1082 1083 map = &td->td_proc->p_vmspace->vm_map; 1084 error = priv_check(td, PRIV_VM_MUNLOCK); 1085 if (error) 1086 return (error); 1087 1088 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1089 vm_map_lock(map); 1090 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1091 vm_map_unlock(map); 1092 1093 /* Forcibly unwire all pages. */ 1094 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1095 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1096 1097 return (error); 1098 } 1099 1100 #ifndef _SYS_SYSPROTO_H_ 1101 struct munlock_args { 1102 const void *addr; 1103 size_t len; 1104 }; 1105 #endif 1106 /* 1107 * MPSAFE 1108 */ 1109 int 1110 munlock(td, uap) 1111 struct thread *td; 1112 struct munlock_args *uap; 1113 { 1114 vm_offset_t addr, end, last, start; 1115 vm_size_t size; 1116 int error; 1117 1118 error = priv_check(td, PRIV_VM_MUNLOCK); 1119 if (error) 1120 return (error); 1121 addr = (vm_offset_t)uap->addr; 1122 size = uap->len; 1123 last = addr + size; 1124 start = trunc_page(addr); 1125 end = round_page(last); 1126 if (last < addr || end < addr) 1127 return (EINVAL); 1128 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1129 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1130 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1131 } 1132 1133 /* 1134 * vm_mmap_vnode() 1135 * 1136 * MPSAFE 1137 * 1138 * Helper function for vm_mmap. Perform sanity check specific for mmap 1139 * operations on vnodes. 1140 */ 1141 int 1142 vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1143 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1144 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp) 1145 { 1146 struct vattr va; 1147 vm_object_t obj; 1148 vm_offset_t foff; 1149 struct mount *mp; 1150 struct ucred *cred; 1151 int error, flags; 1152 int vfslocked; 1153 1154 mp = vp->v_mount; 1155 cred = td->td_ucred; 1156 vfslocked = VFS_LOCK_GIANT(mp); 1157 if ((error = vget(vp, LK_SHARED, td)) != 0) { 1158 VFS_UNLOCK_GIANT(vfslocked); 1159 return (error); 1160 } 1161 foff = *foffp; 1162 flags = *flagsp; 1163 obj = vp->v_object; 1164 if (vp->v_type == VREG) { 1165 /* 1166 * Get the proper underlying object 1167 */ 1168 if (obj == NULL) { 1169 error = EINVAL; 1170 goto done; 1171 } 1172 if (obj->handle != vp) { 1173 vput(vp); 1174 vp = (struct vnode*)obj->handle; 1175 vget(vp, LK_SHARED, td); 1176 } 1177 } else if (vp->v_type == VCHR) { 1178 error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp, 1179 vp->v_rdev, foffp, objp); 1180 if (error == 0) 1181 goto mark_atime; 1182 goto done; 1183 } else { 1184 error = EINVAL; 1185 goto done; 1186 } 1187 if ((error = VOP_GETATTR(vp, &va, cred))) 1188 goto done; 1189 #ifdef MAC 1190 error = mac_vnode_check_mmap(cred, vp, prot, flags); 1191 if (error != 0) 1192 goto done; 1193 #endif 1194 if ((flags & MAP_SHARED) != 0) { 1195 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1196 if (prot & PROT_WRITE) { 1197 error = EPERM; 1198 goto done; 1199 } 1200 *maxprotp &= ~VM_PROT_WRITE; 1201 } 1202 } 1203 /* 1204 * If it is a regular file without any references 1205 * we do not need to sync it. 1206 * Adjust object size to be the size of actual file. 1207 */ 1208 objsize = round_page(va.va_size); 1209 if (va.va_nlink == 0) 1210 flags |= MAP_NOSYNC; 1211 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff); 1212 if (obj == NULL) { 1213 error = ENOMEM; 1214 goto done; 1215 } 1216 *objp = obj; 1217 *flagsp = flags; 1218 1219 mark_atime: 1220 vfs_mark_atime(vp, cred); 1221 1222 done: 1223 vput(vp); 1224 VFS_UNLOCK_GIANT(vfslocked); 1225 return (error); 1226 } 1227 1228 /* 1229 * vm_mmap_cdev() 1230 * 1231 * MPSAFE 1232 * 1233 * Helper function for vm_mmap. Perform sanity check specific for mmap 1234 * operations on cdevs. 1235 */ 1236 int 1237 vm_mmap_cdev(struct thread *td, vm_size_t objsize, 1238 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1239 struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp) 1240 { 1241 vm_object_t obj; 1242 struct cdevsw *dsw; 1243 int error, flags; 1244 1245 flags = *flagsp; 1246 1247 dsw = dev_refthread(cdev); 1248 if (dsw == NULL) 1249 return (ENXIO); 1250 if (dsw->d_flags & D_MMAP_ANON) { 1251 dev_relthread(cdev); 1252 *maxprotp = VM_PROT_ALL; 1253 *flagsp |= MAP_ANON; 1254 return (0); 1255 } 1256 /* 1257 * cdevs do not provide private mappings of any kind. 1258 */ 1259 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1260 (prot & PROT_WRITE) != 0) { 1261 dev_relthread(cdev); 1262 return (EACCES); 1263 } 1264 if (flags & (MAP_PRIVATE|MAP_COPY)) { 1265 dev_relthread(cdev); 1266 return (EINVAL); 1267 } 1268 /* 1269 * Force device mappings to be shared. 1270 */ 1271 flags |= MAP_SHARED; 1272 #ifdef MAC_XXX 1273 error = mac_cdev_check_mmap(td->td_ucred, cdev, prot); 1274 if (error != 0) { 1275 dev_relthread(cdev); 1276 return (error); 1277 } 1278 #endif 1279 /* 1280 * First, try d_mmap_single(). If that is not implemented 1281 * (returns ENODEV), fall back to using the device pager. 1282 * Note that d_mmap_single() must return a reference to the 1283 * object (it needs to bump the reference count of the object 1284 * it returns somehow). 1285 * 1286 * XXX assumes VM_PROT_* == PROT_* 1287 */ 1288 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1289 dev_relthread(cdev); 1290 if (error != ENODEV) 1291 return (error); 1292 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff); 1293 if (obj == NULL) 1294 return (EINVAL); 1295 *objp = obj; 1296 *flagsp = flags; 1297 return (0); 1298 } 1299 1300 /* 1301 * vm_mmap_shm() 1302 * 1303 * MPSAFE 1304 * 1305 * Helper function for vm_mmap. Perform sanity check specific for mmap 1306 * operations on shm file descriptors. 1307 */ 1308 int 1309 vm_mmap_shm(struct thread *td, vm_size_t objsize, 1310 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1311 struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) 1312 { 1313 int error; 1314 1315 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1316 (prot & PROT_WRITE) != 0) 1317 return (EACCES); 1318 #ifdef MAC 1319 error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); 1320 if (error != 0) 1321 return (error); 1322 #endif 1323 error = shm_mmap(shmfd, objsize, foff, objp); 1324 if (error) 1325 return (error); 1326 return (0); 1327 } 1328 1329 /* 1330 * vm_mmap() 1331 * 1332 * MPSAFE 1333 * 1334 * Internal version of mmap. Currently used by mmap, exec, and sys5 1335 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1336 */ 1337 int 1338 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1339 vm_prot_t maxprot, int flags, 1340 objtype_t handle_type, void *handle, 1341 vm_ooffset_t foff) 1342 { 1343 boolean_t fitit; 1344 vm_object_t object = NULL; 1345 int rv = KERN_SUCCESS; 1346 int docow, error; 1347 struct thread *td = curthread; 1348 1349 if (size == 0) 1350 return (0); 1351 1352 size = round_page(size); 1353 1354 PROC_LOCK(td->td_proc); 1355 if (td->td_proc->p_vmspace->vm_map.size + size > 1356 lim_cur(td->td_proc, RLIMIT_VMEM)) { 1357 PROC_UNLOCK(td->td_proc); 1358 return(ENOMEM); 1359 } 1360 PROC_UNLOCK(td->td_proc); 1361 1362 /* 1363 * We currently can only deal with page aligned file offsets. 1364 * The check is here rather than in the syscall because the 1365 * kernel calls this function internally for other mmaping 1366 * operations (such as in exec) and non-aligned offsets will 1367 * cause pmap inconsistencies...so we want to be sure to 1368 * disallow this in all cases. 1369 */ 1370 if (foff & PAGE_MASK) 1371 return (EINVAL); 1372 1373 if ((flags & MAP_FIXED) == 0) { 1374 fitit = TRUE; 1375 *addr = round_page(*addr); 1376 } else { 1377 if (*addr != trunc_page(*addr)) 1378 return (EINVAL); 1379 fitit = FALSE; 1380 } 1381 /* 1382 * Lookup/allocate object. 1383 */ 1384 switch (handle_type) { 1385 case OBJT_DEVICE: 1386 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, 1387 handle, &foff, &object); 1388 break; 1389 case OBJT_VNODE: 1390 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1391 handle, &foff, &object); 1392 break; 1393 case OBJT_SWAP: 1394 error = vm_mmap_shm(td, size, prot, &maxprot, &flags, 1395 handle, foff, &object); 1396 break; 1397 case OBJT_DEFAULT: 1398 if (handle == NULL) { 1399 error = 0; 1400 break; 1401 } 1402 /* FALLTHROUGH */ 1403 default: 1404 error = EINVAL; 1405 break; 1406 } 1407 if (error) 1408 return (error); 1409 if (flags & MAP_ANON) { 1410 object = NULL; 1411 docow = 0; 1412 /* 1413 * Unnamed anonymous regions always start at 0. 1414 */ 1415 if (handle == 0) 1416 foff = 0; 1417 } else { 1418 docow = MAP_PREFAULT_PARTIAL; 1419 } 1420 1421 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1422 docow |= MAP_COPY_ON_WRITE; 1423 if (flags & MAP_NOSYNC) 1424 docow |= MAP_DISABLE_SYNCER; 1425 if (flags & MAP_NOCORE) 1426 docow |= MAP_DISABLE_COREDUMP; 1427 1428 if (flags & MAP_STACK) 1429 rv = vm_map_stack(map, *addr, size, prot, maxprot, 1430 docow | MAP_STACK_GROWS_DOWN); 1431 else if (fitit) 1432 rv = vm_map_find(map, object, foff, addr, size, 1433 object != NULL && object->type == OBJT_DEVICE ? 1434 VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow); 1435 else 1436 rv = vm_map_fixed(map, object, foff, *addr, size, 1437 prot, maxprot, docow); 1438 1439 if (rv != KERN_SUCCESS) { 1440 /* 1441 * Lose the object reference. Will destroy the 1442 * object if it's an unnamed anonymous mapping 1443 * or named anonymous without other references. 1444 */ 1445 vm_object_deallocate(object); 1446 } else if (flags & MAP_SHARED) { 1447 /* 1448 * Shared memory is also shared with children. 1449 */ 1450 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); 1451 if (rv != KERN_SUCCESS) 1452 (void) vm_map_remove(map, *addr, *addr + size); 1453 } 1454 1455 /* 1456 * If the process has requested that all future mappings 1457 * be wired, then heed this. 1458 */ 1459 if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE)) 1460 vm_map_wire(map, *addr, *addr + size, 1461 VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); 1462 1463 switch (rv) { 1464 case KERN_SUCCESS: 1465 return (0); 1466 case KERN_INVALID_ADDRESS: 1467 case KERN_NO_SPACE: 1468 return (ENOMEM); 1469 case KERN_PROTECTION_FAILURE: 1470 return (EACCES); 1471 default: 1472 return (EINVAL); 1473 } 1474 } 1475