/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif	/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif	/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		     entry != &map->header && entry->start < addr + size;
		     entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
				entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = priv_check(td, PRIV_VM_MADV_PROTECT);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		     current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
				++lastvecindex;
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
		++lastvecindex;
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{

	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
	if (error)
		return (error);
	addr = (vm_offset_t)addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall(). vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, findspace, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
		    0, findspace, prot, maxprot, docow);
	} else
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}