/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
        struct thread *td;
        struct sbrk_args *uap;
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
        struct thread *td;
        struct sstk_args *uap;
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
        int dummy;
};
#endif

int
ogetpagesize(td, uap)
        struct thread *td;
        struct getpagesize_args *uap;
{
        /* MP SAFE */
        td->td_retval[0] = PAGE_SIZE;
        return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
        void *addr;
        size_t len;
        int prot;
        int flags;
        int fd;
        long pad;
        off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
        struct thread *td;
        struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
        struct pmckern_map_in pkm;
#endif
        struct file *fp;
        struct vnode *vp;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t cap_maxprot, maxprot;
        void *handle;
        objtype_t handle_type;
        int align, error, flags, prot;
        off_t pos;
        struct vmspace *vms = td->td_proc->p_vmspace;
        cap_rights_t rights;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot;
        flags = uap->flags;
        pos = uap->pos;

        fp = NULL;

        /*
         * Enforce the constraints.
         * Mapping of length 0 is only allowed for old binaries.
         * Anonymous mapping shall specify -1 as the file descriptor and
         * zero position for new code.  Be nice to ancient a.out
         * binaries and correct pos for anonymous mapping, since old
         * ld.so sometimes issues anonymous map requests with non-zero
         * pos.
         */
        if (!SV_CURPROC_FLAG(SV_AOUT)) {
                if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
                    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
                        return (EINVAL);
        } else {
                if ((flags & MAP_ANON) != 0)
                        pos = 0;
        }

        if (flags & MAP_STACK) {
                if ((uap->fd != -1) ||
                    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }
        if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_RENAME |
            MAP_NORESERVE | MAP_HASSEMAPHORE | MAP_STACK | MAP_NOSYNC |
            MAP_ANON | MAP_EXCL | MAP_NOCORE | MAP_PREFAULT_READ |
#ifdef MAP_32BIT
            MAP_32BIT |
#endif
            MAP_ALIGNMENT_MASK)) != 0)
                return (EINVAL);
        if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
                return (EINVAL);
        if ((flags & (MAP_ANON | MAP_SHARED | MAP_PRIVATE)) == 0 ||
            (flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
                return (EINVAL);
        if (prot != PROT_NONE &&
            (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
                return (EINVAL);

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;			/* low end... */
        size = (vm_size_t) round_page(size);	/* hi end */

        /* Ensure alignment is at least a page and fits in a pointer. */
        align = flags & MAP_ALIGNMENT_MASK;
        if (align != 0 && align != MAP_ALIGNED_SUPER &&
            (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
            align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);

                /* Address range must be all in user VM space. */
                if (addr < vm_map_min(&vms->vm_map) ||
                    addr + size > vm_map_max(&vms->vm_map))
                        return (EINVAL);
                if (addr + size < addr)
                        return (EINVAL);
#ifdef MAP_32BIT
                if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
                        return (EINVAL);
        } else if (flags & MAP_32BIT) {
                /*
                 * For MAP_32BIT, override the hint if it is too high and
                 * do not bother moving the mapping past the heap (since
                 * the heap is usually above 2GB).
                 */
                if (addr + size > MAP_32BIT_MAX_ADDR)
                        addr = 0;
#endif
        } else {
                /*
                 * XXX for non-fixed mappings where no hint is provided or
                 * the hint would fall in the potential heap space,
                 * place it after the end of the largest possible heap.
                 *
                 * There should really be a pmap call to determine a reasonable
                 * location.
                 */
                PROC_LOCK(td->td_proc);
                if (addr == 0 ||
                    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
                    addr < round_page((vm_offset_t)vms->vm_daddr +
                    lim_max(td->td_proc, RLIMIT_DATA))))
                        addr = round_page((vm_offset_t)vms->vm_daddr +
                            lim_max(td->td_proc, RLIMIT_DATA));
                PROC_UNLOCK(td->td_proc);
        }
        if (flags & MAP_ANON) {
                /*
                 * Mapping blank space is trivial.
                 */
                handle = NULL;
                handle_type = OBJT_DEFAULT;
                maxprot = VM_PROT_ALL;
                cap_maxprot = VM_PROT_ALL;
        } else {
                /*
                 * Mapping file, get fp for validation and don't let the
                 * descriptor disappear on us if we block.  Check capability
                 * rights, but also return the maximum rights to be combined
                 * with maxprot later.
                 */
                cap_rights_init(&rights, CAP_MMAP);
                if (prot & PROT_READ)
                        cap_rights_set(&rights, CAP_MMAP_R);
                if ((flags & MAP_SHARED) != 0) {
                        if (prot & PROT_WRITE)
                                cap_rights_set(&rights, CAP_MMAP_W);
                }
                if (prot & PROT_EXEC)
                        cap_rights_set(&rights, CAP_MMAP_X);
                error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
                if (error != 0)
                        goto done;
                if (fp->f_type == DTYPE_SHM) {
                        handle = fp->f_data;
                        handle_type = OBJT_SWAP;
                        maxprot = VM_PROT_NONE;

                        /* FREAD should always be set. */
                        if (fp->f_flag & FREAD)
                                maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
                        if (fp->f_flag & FWRITE)
                                maxprot |= VM_PROT_WRITE;
                        goto map;
                }
                if (fp->f_type != DTYPE_VNODE) {
                        error = ENODEV;
                        goto done;
                }
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
                /*
                 * POSIX shared-memory objects are defined to have
                 * kernel persistence, and are not defined to support
                 * read(2)/write(2) -- or even open(2).  Thus, we can
                 * use MAP_ASYNC to trade on-disk coherence for speed.
                 * The shm_open(3) library routine turns on the FPOSIXSHM
                 * flag to request this behavior.
                 */
                if (fp->f_flag & FPOSIXSHM)
                        flags |= MAP_NOSYNC;
#endif
                vp = fp->f_vnode;
                /*
                 * Ensure that file and memory protections are
                 * compatible.  Note that we only worry about
                 * writability if mapping is shared; in this case,
                 * current and max prot are dictated by the open file.
                 * XXX use the vnode instead?  Problem is: what
                 * credentials do we use for determination?  What if
                 * proc does a setuid?
                 */
                if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
                        maxprot = VM_PROT_NONE;
                else
                        maxprot = VM_PROT_EXECUTE;
                if (fp->f_flag & FREAD) {
                        maxprot |= VM_PROT_READ;
                } else if (prot & PROT_READ) {
                        error = EACCES;
                        goto done;
                }
                /*
                 * If we are sharing potential changes (either via
                 * MAP_SHARED or via the implicit sharing of character
                 * device mappings), and we are trying to get write
                 * permission although we opened it without asking
                 * for it, bail out.
                 */
                if ((flags & MAP_SHARED) != 0) {
                        if ((fp->f_flag & FWRITE) != 0) {
                                maxprot |= VM_PROT_WRITE;
                        } else if ((prot & PROT_WRITE) != 0) {
                                error = EACCES;
                                goto done;
                        }
                } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
                        maxprot |= VM_PROT_WRITE;
                        cap_maxprot |= VM_PROT_WRITE;
                }
                handle = (void *)vp;
                handle_type = OBJT_VNODE;
        }
map:
        td->td_fpop = fp;
        maxprot &= cap_maxprot;

        /* This relies on VM_PROT_* matching PROT_*. */
        error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
            flags, handle_type, handle, pos);
        td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
        /* inform hwpmc(4) if an executable is being mapped */
        if (error == 0 && handle_type == OBJT_VNODE &&
            (prot & PROT_EXEC)) {
                pkm.pm_file = handle;
                pkm.pm_address = (uintptr_t) addr;
                PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
        }
#endif
        if (error == 0)
                td->td_retval[0] = (register_t) (addr + pageoff);
done:
        if (fp)
                fdrop(fp, td);

        return (error);
}

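/*
 * Compatibility wrapper for the FreeBSD 6 mmap() ABI: copy the old
 * argument structure field by field into a struct mmap_args and pass
 * it on to sys_mmap().
 */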
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
        struct mmap_args oargs;

        oargs.addr = uap->addr;
        oargs.len = uap->len;
        oargs.prot = uap->prot;
        oargs.flags = uap->flags;
        oargs.fd = uap->fd;
        oargs.pos = uap->pos;
        return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
        caddr_t addr;
        int len;
        int prot;
        int flags;
        int fd;
        long pos;
};
#endif
int
ommap(td, uap)
        struct thread *td;
        struct ommap_args *uap;
{
        struct mmap_args nargs;
        static const char cvtbsdprot[8] = {
                0,
                PROT_EXEC,
                PROT_WRITE,
                PROT_EXEC | PROT_WRITE,
                PROT_READ,
                PROT_EXEC | PROT_READ,
                PROT_WRITE | PROT_READ,
                PROT_EXEC | PROT_WRITE | PROT_READ,
        };

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

        nargs.addr = uap->addr;
        nargs.len = uap->len;
        nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__)
        if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
            nargs.prot != 0)
                nargs.prot |= PROT_EXEC;
#endif
#endif
        nargs.flags = 0;
        if (uap->flags & OMAP_ANON)
                nargs.flags |= MAP_ANON;
        if (uap->flags & OMAP_COPY)
                nargs.flags |= MAP_COPY;
        if (uap->flags & OMAP_SHARED)
                nargs.flags |= MAP_SHARED;
        else
                nargs.flags |= MAP_PRIVATE;
        if (uap->flags & OMAP_FIXED)
                nargs.flags |= MAP_FIXED;
        nargs.fd = uap->fd;
        nargs.pos = uap->pos;
        return (sys_mmap(td, &nargs));
}
#endif /* COMPAT_43 */

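/*
 * msync(2) system call: round the request to page boundaries and have
 * vm_map_sync() flush (and, with MS_INVALIDATE, invalidate) the pages
 * backing the region.
 */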
#ifndef _SYS_SYSPROTO_H_
struct msync_args {
        void *addr;
        size_t len;
        int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
        struct thread *td;
        struct msync_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int flags;
        vm_map_t map;
        int rv;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        flags = uap->flags;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
                return (EINVAL);

        map = &td->td_proc->p_vmspace->vm_map;

        /*
         * Clean the pages and interpret the return value.
         */
        rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
            (flags & MS_INVALIDATE) != 0);
        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
                return (ENOMEM);
        case KERN_INVALID_ARGUMENT:
                return (EBUSY);
        case KERN_FAILURE:
                return (EIO);
        default:
                return (EINVAL);
        }
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
        void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
        struct thread *td;
        struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
        struct pmckern_map_out pkm;
        vm_map_entry_t entry;
#endif
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        if (size == 0)
                return (EINVAL);

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap...
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
                return (EINVAL);
        vm_map_lock(map);
#ifdef HWPMC_HOOKS
        /*
         * Inform hwpmc if the address range being unmapped contains
         * an executable region.
         */
        pkm.pm_address = (uintptr_t) NULL;
        if (vm_map_lookup_entry(map, addr, &entry)) {
                for (;
                    entry != &map->header && entry->start < addr + size;
                    entry = entry->next) {
                        if (vm_map_check_protection(map, entry->start,
                            entry->end, VM_PROT_EXECUTE) == TRUE) {
                                pkm.pm_address = (uintptr_t) addr;
                                pkm.pm_size = (size_t) size;
                                break;
                        }
                }
        }
#endif
        vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
        /* downgrade the lock to prevent a LOR with the pmc-sx lock */
        vm_map_lock_downgrade(map);
        if (pkm.pm_address != (uintptr_t) NULL)
                PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
        vm_map_unlock_read(map);
#else
        vm_map_unlock(map);
#endif
        /* vm_map_delete returns nothing but KERN_SUCCESS anyway */
        return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
        const void *addr;
        size_t len;
        int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
        struct thread *td;
        struct mprotect_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, prot, FALSE)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        case KERN_RESOURCE_SHORTAGE:
                return (ENOMEM);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
        void *addr;
        size_t len;
        int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
        struct thread *td;
        struct minherit_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_inherit_t inherit;

        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        inherit = uap->inherit;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, inherit)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
        void *addr;
        size_t len;
        int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
        struct thread *td;
        struct madvise_args *uap;
{
        vm_offset_t start, end;
        vm_map_t map;
        int flags;

        /*
         * Check for our special case, advising the swap pager we are
         * "immortal."
         */
        if (uap->behav == MADV_PROTECT) {
                flags = PPROT_SET;
                return (kern_procctl(td, P_PID, td->td_proc->p_pid,
                    PROC_SPROTECT, &flags));
        }

        /*
         * Check for illegal behavior
         */
        if (uap->behav < 0 || uap->behav > MADV_CORE)
                return (EINVAL);
        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if ((vm_offset_t)uap->addr < vm_map_min(map) ||
            (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
                return (EINVAL);
        if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
                return (EINVAL);

        /*
         * Since this routine is only advisory, we default to conservative
         * behavior.
         */
        start = trunc_page((vm_offset_t) uap->addr);
        end = round_page((vm_offset_t) uap->addr + uap->len);

        if (vm_map_madvise(map, start, end, uap->behav))
                return (EINVAL);
        return (0);
}

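/*
 * mincore(2) system call: walk the map entries covering the request and
 * report, one byte per page in the user-supplied vector, whether each
 * page is resident and whether it has been referenced or modified.
 */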
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
        const void *addr;
        size_t len;
        char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
        struct thread *td;
        struct mincore_args *uap;
{
        vm_offset_t addr, first_addr;
        vm_offset_t end, cend;
        pmap_t pmap;
        vm_map_t map;
        char *vec;
        int error = 0;
        int vecindex, lastvecindex;
        vm_map_entry_t current;
        vm_map_entry_t entry;
        vm_object_t object;
        vm_paddr_t locked_pa;
        vm_page_t m;
        vm_pindex_t pindex;
        int mincoreinfo;
        unsigned int timestamp;
        boolean_t locked;

        /*
         * Make sure that the addresses presented are valid for user
         * mode.
         */
        first_addr = addr = trunc_page((vm_offset_t) uap->addr);
        end = addr + (vm_size_t)round_page(uap->len);
        map = &td->td_proc->p_vmspace->vm_map;
        if (end > vm_map_max(map) || end < addr)
                return (ENOMEM);

        /*
         * Address of byte vector
         */
        vec = uap->vec;

        pmap = vmspace_pmap(td->td_proc->p_vmspace);

        vm_map_lock_read(map);
RestartScan:
        timestamp = map->timestamp;

        if (!vm_map_lookup_entry(map, addr, &entry)) {
                vm_map_unlock_read(map);
                return (ENOMEM);
        }

        /*
         * Do this on a map entry basis so that if the pages are not
         * in the current process's address space, we can easily look
         * up the pages elsewhere.
         */
        lastvecindex = -1;
        for (current = entry;
            (current != &map->header) && (current->start < end);
            current = current->next) {

                /*
                 * check for contiguity
                 */
                if (current->end < end &&
                    (entry->next == &map->header ||
                    current->next->start > current->end)) {
                        vm_map_unlock_read(map);
                        return (ENOMEM);
                }

                /*
                 * ignore submaps (for now) or null objects
                 */
                if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
                    current->object.vm_object == NULL)
                        continue;

                /*
                 * limit this scan to the current map entry and the
                 * limits for the mincore call
                 */
                if (addr < current->start)
                        addr = current->start;
                cend = current->end;
                if (cend > end)
                        cend = end;

                /*
                 * scan this entry one page at a time
                 */
                while (addr < cend) {
                        /*
                         * Check pmap first, it is likely faster, also
                         * it can provide info as to whether we are the
                         * one referencing or modifying the page.
                         */
                        object = NULL;
                        locked_pa = 0;
                retry:
                        m = NULL;
                        mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
                        if (locked_pa != 0) {
                                /*
                                 * The page is mapped by this process but not
                                 * both accessed and modified.  It is also
                                 * managed.  Acquire the object lock so that
                                 * other mappings might be examined.
                                 */
                                m = PHYS_TO_VM_PAGE(locked_pa);
                                if (m->object != object) {
                                        if (object != NULL)
                                                VM_OBJECT_WUNLOCK(object);
                                        object = m->object;
                                        locked = VM_OBJECT_TRYWLOCK(object);
                                        vm_page_unlock(m);
                                        if (!locked) {
                                                VM_OBJECT_WLOCK(object);
                                                vm_page_lock(m);
                                                goto retry;
                                        }
                                } else
                                        vm_page_unlock(m);
                                KASSERT(m->valid == VM_PAGE_BITS_ALL,
                                    ("mincore: page %p is mapped but invalid",
                                    m));
                        } else if (mincoreinfo == 0) {
                                /*
                                 * The page is not mapped by this process.  If
                                 * the object implements managed pages, then
                                 * determine if the page is resident so that
                                 * the mappings might be examined.
                                 */
                                if (current->object.vm_object != object) {
                                        if (object != NULL)
                                                VM_OBJECT_WUNLOCK(object);
                                        object = current->object.vm_object;
                                        VM_OBJECT_WLOCK(object);
                                }
                                if (object->type == OBJT_DEFAULT ||
                                    object->type == OBJT_SWAP ||
                                    object->type == OBJT_VNODE) {
                                        pindex = OFF_TO_IDX(current->offset +
                                            (addr - current->start));
                                        m = vm_page_lookup(object, pindex);
                                        if (m == NULL &&
                                            vm_page_is_cached(object, pindex))
                                                mincoreinfo = MINCORE_INCORE;
                                        if (m != NULL && m->valid == 0)
                                                m = NULL;
                                        if (m != NULL)
                                                mincoreinfo = MINCORE_INCORE;
                                }
                        }
                        if (m != NULL) {
                                /* Examine other mappings to the page. */
                                if (m->dirty == 0 && pmap_is_modified(m))
                                        vm_page_dirty(m);
                                if (m->dirty != 0)
                                        mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                /*
                                 * The first test for PGA_REFERENCED is an
                                 * optimization.  The second test is
                                 * required because a concurrent pmap
                                 * operation could clear the last reference
                                 * and set PGA_REFERENCED before the call to
                                 * pmap_is_referenced().
                                 */
                                if ((m->aflags & PGA_REFERENCED) != 0 ||
                                    pmap_is_referenced(m) ||
                                    (m->aflags & PGA_REFERENCED) != 0)
                                        mincoreinfo |= MINCORE_REFERENCED_OTHER;
                        }
                        if (object != NULL)
                                VM_OBJECT_WUNLOCK(object);

                        /*
                         * subyte may page fault.  In case it needs to modify
                         * the map, we release the lock.
                         */
                        vm_map_unlock_read(map);

                        /*
                         * calculate index into user supplied byte vector
                         */
                        vecindex = OFF_TO_IDX(addr - first_addr);

                        /*
                         * If we have skipped map entries, we need to make sure that
                         * the byte vector is zeroed for those skipped entries.
                         */
                        while ((lastvecindex + 1) < vecindex) {
                                ++lastvecindex;
                                error = subyte(vec + lastvecindex, 0);
                                if (error) {
                                        error = EFAULT;
                                        goto done2;
                                }
                        }

                        /*
                         * Pass the page information to the user
                         */
                        error = subyte(vec + vecindex, mincoreinfo);
                        if (error) {
                                error = EFAULT;
                                goto done2;
                        }

                        /*
                         * If the map has changed, due to the subyte, the previous
                         * output may be invalid.
                         */
                        vm_map_lock_read(map);
                        if (timestamp != map->timestamp)
                                goto RestartScan;

                        lastvecindex = vecindex;
                        addr += PAGE_SIZE;
                }
        }

        /*
         * subyte may page fault.  In case it needs to modify
         * the map, we release the lock.
         */
        vm_map_unlock_read(map);

        /*
         * Zero the last entries in the byte vector.
         */
        vecindex = OFF_TO_IDX(end - first_addr);
        while ((lastvecindex + 1) < vecindex) {
                ++lastvecindex;
                error = subyte(vec + lastvecindex, 0);
                if (error) {
                        error = EFAULT;
                        goto done2;
                }
        }

        /*
         * If the map has changed, due to the subyte, the previous
         * output may be invalid.
         */
        vm_map_lock_read(map);
        if (timestamp != map->timestamp)
                goto RestartScan;
        vm_map_unlock_read(map);
done2:
        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
        struct thread *td;
        struct mlock_args *uap;
{

        return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

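/*
 * Common implementation of mlock(2): check PRIV_VM_MLOCK, the
 * RLIMIT_MEMLOCK and vm_page_max_wired limits, and RACCT accounting
 * before wiring the rounded address range with vm_map_wire().
 */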
int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
        vm_offset_t addr, end, last, start;
        vm_size_t npages, size;
        vm_map_t map;
        unsigned long nsize;
        int error;

        error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
        if (error)
                return (error);
        addr = (vm_offset_t)addr0;
        size = len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        npages = atop(end - start);
        if (npages > vm_page_max_wired)
                return (ENOMEM);
        map = &proc->p_vmspace->vm_map;
        PROC_LOCK(proc);
        nsize = ptoa(npages + pmap_wired_count(map->pmap));
        if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(proc);
        if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
                return (EAGAIN);
#ifdef RACCT
        PROC_LOCK(proc);
        error = racct_set(proc, RACCT_MEMLOCK, nsize);
        PROC_UNLOCK(proc);
        if (error != 0)
                return (ENOMEM);
#endif
        error = vm_map_wire(map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
        if (error != KERN_SUCCESS) {
                PROC_LOCK(proc);
                racct_set(proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(proc);
        }
#endif
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
        int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
        struct thread *td;
        struct mlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = priv_check(td, PRIV_VM_MLOCK);
        if (error)
                return (error);

        if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
                return (EINVAL);

        /*
         * If wiring all pages in the process would cause it to exceed
         * a hard resource limit, return ENOMEM.
         */
        if (!old_mlock && uap->how & MCL_CURRENT) {
                PROC_LOCK(td->td_proc);
                if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                PROC_UNLOCK(td->td_proc);
        }
#ifdef RACCT
        PROC_LOCK(td->td_proc);
        error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
        PROC_UNLOCK(td->td_proc);
        if (error != 0)
                return (ENOMEM);
#endif

        if (uap->how & MCL_FUTURE) {
                vm_map_lock(map);
                vm_map_modflags(map, MAP_WIREFUTURE, 0);
                vm_map_unlock(map);
                error = 0;
        }

        if (uap->how & MCL_CURRENT) {
                /*
                 * P1003.1-2001 mandates that all currently mapped pages
                 * will be memory resident and locked (wired) upon return
                 * from mlockall().  vm_map_wire() will wire pages, by
                 * calling vm_fault_wire() for each page in the region.
                 */
                error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
                error = (error == KERN_SUCCESS ? 0 : EAGAIN);
        }
#ifdef RACCT
        if (error != KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                racct_set(td->td_proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(td->td_proc);
        }
#endif

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
        register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
        struct thread *td;
        struct munlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = priv_check(td, PRIV_VM_MUNLOCK);
        if (error)
                return (error);

        /* Clear the MAP_WIREFUTURE flag from this vm_map. */
        vm_map_lock(map);
        vm_map_modflags(map, 0, MAP_WIREFUTURE);
        vm_map_unlock(map);

        /* Forcibly unwire all pages. */
        error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
            VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
        if (error == KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                racct_set(td->td_proc, RACCT_MEMLOCK, 0);
                PROC_UNLOCK(td->td_proc);
        }
#endif

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
        struct thread *td;
        struct munlock_args *uap;
{
        vm_offset_t addr, end, last, start;
        vm_size_t size;
#ifdef RACCT
        vm_map_t map;
#endif
        int error;

        error = priv_check(td, PRIV_VM_MUNLOCK);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
        if (error == KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                map = &td->td_proc->p_vmspace->vm_map;
                racct_set(td->td_proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(td->td_proc);
        }
#endif
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
        struct vattr va;
        vm_object_t obj;
        vm_offset_t foff;
        struct mount *mp;
        struct ucred *cred;
        int error, flags, locktype;

        mp = vp->v_mount;
        cred = td->td_ucred;
        if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
                locktype = LK_EXCLUSIVE;
        else
                locktype = LK_SHARED;
        if ((error = vget(vp, locktype, td)) != 0)
                return (error);
        foff = *foffp;
        flags = *flagsp;
        obj = vp->v_object;
        if (vp->v_type == VREG) {
                /*
                 * Get the proper underlying object
                 */
                if (obj == NULL) {
                        error = EINVAL;
                        goto done;
                }
                if (obj->type == OBJT_VNODE && obj->handle != vp) {
                        vput(vp);
                        vp = (struct vnode *)obj->handle;
                        /*
                         * Bypass filesystems obey the mpsafety of the
                         * underlying fs.  Tmpfs never bypasses.
                         */
                        error = vget(vp, locktype, td);
                        if (error != 0)
                                return (error);
                }
                if (locktype == LK_EXCLUSIVE) {
                        *writecounted = TRUE;
                        vnode_pager_update_writecount(obj, 0, objsize);
                }
        } else if (vp->v_type == VCHR) {
                error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
                    vp->v_rdev, foffp, objp);
                if (error == 0)
                        goto mark_atime;
                goto done;
        } else {
                error = EINVAL;
                goto done;
        }
        if ((error = VOP_GETATTR(vp, &va, cred)))
                goto done;
#ifdef MAC
        error = mac_vnode_check_mmap(cred, vp, prot, flags);
        if (error != 0)
                goto done;
#endif
        if ((flags & MAP_SHARED) != 0) {
                if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
                        if (prot & PROT_WRITE) {
                                error = EPERM;
                                goto done;
                        }
                        *maxprotp &= ~VM_PROT_WRITE;
                }
        }
        /*
         * If it is a regular file without any references
         * we do not need to sync it.
         * Adjust object size to be the size of actual file.
         */
        objsize = round_page(va.va_size);
        if (va.va_nlink == 0)
                flags |= MAP_NOSYNC;
        if (obj->type == OBJT_VNODE)
                obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
                    cred);
        else {
                KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
                    ("wrong object type"));
                vm_object_reference(obj);
        }
        if (obj == NULL) {
                error = ENOMEM;
                goto done;
        }
        *objp = obj;
        *flagsp = flags;

mark_atime:
        vfs_mark_atime(vp, cred);

done:
        if (error != 0 && *writecounted) {
                *writecounted = FALSE;
                vnode_pager_update_writecount(obj, objsize, 0);
        }
        vput(vp);
        return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
        vm_object_t obj;
        struct cdevsw *dsw;
        int error, flags, ref;

        flags = *flagsp;

        dsw = dev_refthread(cdev, &ref);
        if (dsw == NULL)
                return (ENXIO);
        if (dsw->d_flags & D_MMAP_ANON) {
                dev_relthread(cdev, ref);
                *maxprotp = VM_PROT_ALL;
                *flagsp |= MAP_ANON;
                return (0);
        }
        /*
         * cdevs do not provide private mappings of any kind.
         */
        if ((*maxprotp & VM_PROT_WRITE) == 0 &&
            (prot & PROT_WRITE) != 0) {
                dev_relthread(cdev, ref);
                return (EACCES);
        }
        if (flags & (MAP_PRIVATE|MAP_COPY)) {
                dev_relthread(cdev, ref);
                return (EINVAL);
        }
        /*
         * Force device mappings to be shared.
         */
        flags |= MAP_SHARED;
#ifdef MAC_XXX
        error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
        if (error != 0) {
                dev_relthread(cdev, ref);
                return (error);
        }
#endif
        /*
         * First, try d_mmap_single().  If that is not implemented
         * (returns ENODEV), fall back to using the device pager.
         * Note that d_mmap_single() must return a reference to the
         * object (it needs to bump the reference count of the object
         * it returns somehow).
         *
         * XXX assumes VM_PROT_* == PROT_*
         */
        error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
        dev_relthread(cdev, ref);
        if (error != ENODEV)
                return (error);
        obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
            td->td_ucred);
        if (obj == NULL)
                return (EINVAL);
        *objp = obj;
        *flagsp = flags;
        return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
        int error;

        if ((*flagsp & MAP_SHARED) != 0 &&
            (*maxprotp & VM_PROT_WRITE) == 0 &&
            (prot & PROT_WRITE) != 0)
                return (EACCES);
#ifdef MAC
        error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
        if (error != 0)
                return (error);
#endif
        error = shm_mmap(shmfd, objsize, foff, objp);
        if (error)
                return (error);
        return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
        boolean_t fitit;
        vm_object_t object = NULL;
        struct thread *td = curthread;
        int docow, error, findspace, rv;
        boolean_t writecounted;

        if (size == 0)
                return (0);

        size = round_page(size);

        if (map == &td->td_proc->p_vmspace->vm_map) {
                PROC_LOCK(td->td_proc);
                if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                if (!old_mlock && map->flags & MAP_WIREFUTURE) {
                        if (ptoa(pmap_wired_count(map->pmap)) + size >
                            lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
                                racct_set_force(td->td_proc, RACCT_VMEM,
                                    map->size);
                                PROC_UNLOCK(td->td_proc);
                                return (ENOMEM);
                        }
                        error = racct_set(td->td_proc, RACCT_MEMLOCK,
                            ptoa(pmap_wired_count(map->pmap)) + size);
                        if (error != 0) {
                                racct_set_force(td->td_proc, RACCT_VMEM,
                                    map->size);
                                PROC_UNLOCK(td->td_proc);
                                return (error);
                        }
                }
                PROC_UNLOCK(td->td_proc);
        }

        /*
         * We currently can only deal with page aligned file offsets.
         * The check is here rather than in the syscall because the
         * kernel calls this function internally for other mmapping
         * operations (such as in exec) and non-aligned offsets will
         * cause pmap inconsistencies...so we want to be sure to
         * disallow this in all cases.
         */
        if (foff & PAGE_MASK)
                return (EINVAL);

        if ((flags & MAP_FIXED) == 0) {
                fitit = TRUE;
                *addr = round_page(*addr);
        } else {
                if (*addr != trunc_page(*addr))
                        return (EINVAL);
                fitit = FALSE;
        }
        writecounted = FALSE;

        /*
         * Lookup/allocate object.
         */
        switch (handle_type) {
        case OBJT_DEVICE:
                error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
                    handle, &foff, &object);
                break;
        case OBJT_VNODE:
                error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
                    handle, &foff, &object, &writecounted);
                break;
        case OBJT_SWAP:
                error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
                    handle, foff, &object);
                break;
        case OBJT_DEFAULT:
                if (handle == NULL) {
                        error = 0;
                        break;
                }
                /* FALLTHROUGH */
        default:
                error = EINVAL;
                break;
        }
        if (error)
                return (error);
        if (flags & MAP_ANON) {
                object = NULL;
                docow = 0;
                /*
                 * Unnamed anonymous regions always start at 0.
                 */
                if (handle == 0)
                        foff = 0;
        } else if (flags & MAP_PREFAULT_READ)
                docow = MAP_PREFAULT;
        else
                docow = MAP_PREFAULT_PARTIAL;

        if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
                docow |= MAP_COPY_ON_WRITE;
        if (flags & MAP_NOSYNC)
                docow |= MAP_DISABLE_SYNCER;
        if (flags & MAP_NOCORE)
                docow |= MAP_DISABLE_COREDUMP;
        /* Shared memory is also shared with children. */
        if (flags & MAP_SHARED)
                docow |= MAP_INHERIT_SHARE;
        if (writecounted)
                docow |= MAP_VN_WRITECOUNT;
        if (flags & MAP_STACK) {
                if (object != NULL)
                        return (EINVAL);
                docow |= MAP_STACK_GROWS_DOWN;
        }
        if ((flags & MAP_EXCL) != 0)
                docow |= MAP_CHECK_EXCL;

        if (fitit) {
                if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
                        findspace = VMFS_SUPER_SPACE;
                else if ((flags & MAP_ALIGNMENT_MASK) != 0)
                        findspace = VMFS_ALIGNED_SPACE(flags >>
                            MAP_ALIGNMENT_SHIFT);
                else
                        findspace = VMFS_OPTIMAL_SPACE;
                rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
                    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
                    0, findspace, prot, maxprot, docow);
        } else {
                rv = vm_map_fixed(map, object, foff, *addr, size,
                    prot, maxprot, docow);
        }

        if (rv == KERN_SUCCESS) {
                /*
                 * If the process has requested that all future mappings
                 * be wired, then heed this.
                 */
                if (map->flags & MAP_WIREFUTURE) {
                        vm_map_wire(map, *addr, *addr + size,
                            VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
                            VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
                }
        } else {
                /*
                 * If this mapping was accounted for in the vnode's
                 * writecount, then undo that now.
                 */
                if (writecounted)
                        vnode_pager_release_writecount(object, 0, size);
                /*
                 * Lose the object reference.  Will destroy the
                 * object if it's an unnamed anonymous mapping
                 * or named anonymous without other references.
                 */
                vm_object_deallocate(object);
        }
        return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
        case KERN_NO_SPACE:
                return (ENOMEM);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        default:
                return (EINVAL);
        }
}