/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags, prot;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	/* XXX: MAP_RENAME, MAP_NORESERVE */
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_ANON | MAP_SHARED | MAP_PRIVATE)) == 0 ||
	    (flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block. Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;

	/* This relies on VM_PROT_* matching PROT_*. */
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{

	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
	if (error)
		return (error);
	addr = (vm_offset_t)addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, findspace, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
		    0, findspace, prot, maxprot, docow);
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}