/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0,
    "Maximum number of memory-mapped files per process");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking, vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel from running out of
 * resources if attacked from a compromised user account but generous enough
 * that multi-threaded processes are not unduly inconvenienced.
 */
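/*
 * For example (the sizes here are purely illustrative): with 1 GB of KVM
 * malloc space and a 64-byte vm_map_entry, the default computed in
 * vmmapentry_rsrc_init() below would be (1073741824 / 64) / 100, or
 * roughly 167,000 map entries per process.  The limit can be inspected
 * and tuned at run time through the vm.max_proc_mmap sysctl declared above.
 */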
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init,
    NULL);

static void
vmmapentry_rsrc_init(dummy)
	void *dummy;
{
	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int flags, error;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/* Make sure mapping fits into numeric range, etc. */
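	/*
	 * A zero length is rejected only for binaries built against a
	 * sufficiently new ABI (p_osrel >= 800104) that are not a.out
	 * images; older binaries keep the historical, more permissive
	 * handling of a zero-length request.
	 */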
	if ((uap->len == 0 && !SV_CURPROC_FLAG(SV_AOUT) &&
	    curproc->p_osrel >= 800104) ||
	    ((flags & MAP_ANON) && uap->fd != -1))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		pos = 0;
	} else {
		/*
		 * Mapping file, get fp for validation and
		 * don't let the descriptor disappear on us if we block.
		 */
		if ((error = fget(td, uap->fd, &fp)) != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
		error = ENOMEM;
		goto done;
	}

	td->td_fpop = fp;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				PMC_CALL_HOOK(td, PMC_FN_MUNMAP,
				    (void *) &pkm);
				break;
			}
		}
	}
#endif
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_delete(map, addr, addr + size);
	vm_map_unlock(map);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}
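
/*
 * Usage note: the inheritance values accepted above are the INHERIT_SHARE,
 * INHERIT_COPY and INHERIT_NONE constants from <sys/mman.h>.  For example,
 * a process that wants to keep a region out of its children's address
 * spaces can call minherit(addr, len, INHERIT_NONE) before fork().
 */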

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = priv_check(td, PRIV_VM_MADV_PROTECT);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (!mincoreinfo) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;
				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);
				VM_OBJECT_LOCK(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
				    pindex);
				/*
				 * if the page is resident, then gather
				 * information about it.
				 */
				if (m != NULL && m->valid != 0) {
					mincoreinfo = MINCORE_INCORE;
					vm_page_lock_queues();
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
					vm_page_unlock_queues();
				}
				VM_OBJECT_UNLOCK(current->object.vm_object);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	struct proc *proc;
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	int error;

	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	proc = td->td_proc;
	PROC_LOCK(proc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
	    lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
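
/*
 * Note that an mlock() request is bounded both by the caller's
 * RLIMIT_MEMLOCK resource limit and by the global vm_page_max_wired
 * cap on wired pages: a request that could never fit under the cap
 * fails with ENOMEM, while one that would merely push the current
 * system-wide wired-page count over it fails with EAGAIN.
 */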

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = 0;

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

#if 0
	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	PROC_LOCK(td->td_proc);
	if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) >
	    lim_cur(td->td_proc, RLIMIT_MEMLOCK))) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);
#else
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
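
/*
 * The helpers below (vm_mmap_vnode(), vm_mmap_cdev() and vm_mmap_shm())
 * are dispatched from vm_mmap() according to the handle type (OBJT_VNODE,
 * OBJT_DEVICE and OBJT_SWAP, respectively).  Each validates the request
 * and normally returns the VM object that will back the mapping.  Note
 * that mmap() itself reaches vm_mmap_cdev() indirectly, via
 * vm_mmap_vnode(), when the descriptor refers to a character-device vnode.
 */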

/*
 * vm_mmap_vnode()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to an
 * mmap operation on a vnode.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags;
	int vfslocked;

	mp = vp->v_mount;
	cred = td->td_ucred;
	vfslocked = VFS_LOCK_GIANT(mp);
	if ((error = vget(vp, LK_SHARED, td)) != 0) {
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->handle != vp) {
			vput(vp);
			vp = (struct vnode*)obj->handle;
			vget(vp, LK_SHARED, td);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, td->td_ucred);
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to an
 * mmap operation on a cdev.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags;

	flags = *flagsp;

	dsw = dev_refthread(cdev);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to an
 * mmap operation on a shm file descriptor.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	int rv = KERN_SUCCESS;
	int docow, error;
	struct thread *td = curthread;

	if (size == 0)
		return (0);

	size = round_page(size);

	PROC_LOCK(td->td_proc);
	if (td->td_proc->p_vmspace->vm_map.size + size >
	    lim_cur(td->td_proc, RLIMIT_VMEM)) {
		PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(td->td_proc);

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else {
		docow = MAP_PREFAULT_PARTIAL;
	}

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else if (fitit)
		rv = vm_map_find(map, object, foff, addr, size,
		    object != NULL && object->type == OBJT_DEVICE ?
		    VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
	else
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	} else if (flags & MAP_SHARED) {
		/*
		 * Shared memory is also shared with children.
		 */
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS)
			(void) vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * If the process has requested that all future mappings
	 * be wired, then heed this.
	 */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_wire(map, *addr, *addr + size,
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}