1 /*- 2 * Copyright (c) 1988 University of Utah. 3 * Copyright (c) 1991, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the Systems Programming Group of the University of Utah Computer 8 * Science Department. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 35 * 36 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 37 */ 38 39 /* 40 * Mapped file (mmap) interface to VM 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_compat.h" 47 #include "opt_hwpmc_hooks.h" 48 #include "opt_vm.h" 49 50 #include <sys/param.h> 51 #include <sys/systm.h> 52 #include <sys/capsicum.h> 53 #include <sys/kernel.h> 54 #include <sys/lock.h> 55 #include <sys/mutex.h> 56 #include <sys/sysproto.h> 57 #include <sys/filedesc.h> 58 #include <sys/priv.h> 59 #include <sys/proc.h> 60 #include <sys/procctl.h> 61 #include <sys/racct.h> 62 #include <sys/resource.h> 63 #include <sys/resourcevar.h> 64 #include <sys/rwlock.h> 65 #include <sys/sysctl.h> 66 #include <sys/vnode.h> 67 #include <sys/fcntl.h> 68 #include <sys/file.h> 69 #include <sys/mman.h> 70 #include <sys/mount.h> 71 #include <sys/conf.h> 72 #include <sys/stat.h> 73 #include <sys/syscallsubr.h> 74 #include <sys/sysent.h> 75 #include <sys/vmmeter.h> 76 77 #include <security/mac/mac_framework.h> 78 79 #include <vm/vm.h> 80 #include <vm/vm_param.h> 81 #include <vm/pmap.h> 82 #include <vm/vm_map.h> 83 #include <vm/vm_object.h> 84 #include <vm/vm_page.h> 85 #include <vm/vm_pager.h> 86 #include <vm/vm_pageout.h> 87 #include <vm/vm_extern.h> 88 #include <vm/vm_page.h> 89 #include <vm/vnode_pager.h> 90 91 #ifdef HWPMC_HOOKS 92 #include <sys/pmckern.h> 93 #endif 94 95 int old_mlock = 0; 96 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, 97 "Do not apply RLIMIT_MEMLOCK on mlockall"); 98 99 #ifdef MAP_32BIT 100 #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) 101 #endif 102 103 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 104 int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); 105 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 106 int *, struct cdev *, vm_ooffset_t *, vm_object_t *); 107 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 108 int *, struct shmfd *, vm_ooffset_t, vm_object_t *); 109 110 #ifndef _SYS_SYSPROTO_H_ 111 struct sbrk_args { 112 int incr; 113 }; 114 #endif 115 116 /* 117 * MPSAFE 118 */ 119 /* ARGSUSED */ 120 int 121 sys_sbrk(td, uap) 122 struct thread *td; 123 struct sbrk_args *uap; 124 { 125 /* Not yet implemented */ 126 return (EOPNOTSUPP); 127 } 128 129 #ifndef _SYS_SYSPROTO_H_ 130 struct sstk_args { 131 int incr; 132 }; 133 #endif 134 135 /* 136 * MPSAFE 137 */ 138 /* ARGSUSED */ 139 int 140 sys_sstk(td, uap) 141 struct thread *td; 142 struct sstk_args *uap; 143 { 144 /* Not yet implemented */ 145 return (EOPNOTSUPP); 146 } 147 148 #if defined(COMPAT_43) 149 #ifndef _SYS_SYSPROTO_H_ 150 struct getpagesize_args { 151 int dummy; 152 }; 153 #endif 154 155 int 156 ogetpagesize(td, uap) 157 struct thread *td; 158 struct getpagesize_args *uap; 159 { 160 /* MP SAFE */ 161 td->td_retval[0] = PAGE_SIZE; 162 return (0); 163 } 164 #endif /* COMPAT_43 */ 165 166 167 /* 168 * Memory Map (mmap) system call. Note that the file offset 169 * and address are allowed to be NOT page aligned, though if 170 * the MAP_FIXED flag it set, both must have the same remainder 171 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 172 * page-aligned, the actual mapping starts at trunc_page(addr) 173 * and the return value is adjusted up by the page offset. 174 * 175 * Generally speaking, only character devices which are themselves 176 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 177 * there would be no cache coherency between a descriptor and a VM mapping 178 * both to the same character device. 179 */ 180 #ifndef _SYS_SYSPROTO_H_ 181 struct mmap_args { 182 void *addr; 183 size_t len; 184 int prot; 185 int flags; 186 int fd; 187 long pad; 188 off_t pos; 189 }; 190 #endif 191 192 /* 193 * MPSAFE 194 */ 195 int 196 sys_mmap(td, uap) 197 struct thread *td; 198 struct mmap_args *uap; 199 { 200 #ifdef HWPMC_HOOKS 201 struct pmckern_map_in pkm; 202 #endif 203 struct file *fp; 204 struct vnode *vp; 205 vm_offset_t addr; 206 vm_size_t size, pageoff; 207 vm_prot_t cap_maxprot, maxprot; 208 void *handle; 209 objtype_t handle_type; 210 int align, error, flags, prot; 211 off_t pos; 212 struct vmspace *vms = td->td_proc->p_vmspace; 213 cap_rights_t rights; 214 215 addr = (vm_offset_t) uap->addr; 216 size = uap->len; 217 prot = uap->prot; 218 flags = uap->flags; 219 pos = uap->pos; 220 221 fp = NULL; 222 223 /* 224 * Ignore old flags that used to be defined but did not do anything. 225 */ 226 flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); 227 228 /* 229 * Enforce the constraints. 230 * Mapping of length 0 is only allowed for old binaries. 231 * Anonymous mapping shall specify -1 as filedescriptor and 232 * zero position for new code. Be nice to ancient a.out 233 * binaries and correct pos for anonymous mapping, since old 234 * ld.so sometimes issues anonymous map requests with non-zero 235 * pos. 236 */ 237 if (!SV_CURPROC_FLAG(SV_AOUT)) { 238 if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || 239 ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0))) 240 return (EINVAL); 241 } else { 242 if ((flags & MAP_ANON) != 0) 243 pos = 0; 244 } 245 246 if (flags & MAP_STACK) { 247 if ((uap->fd != -1) || 248 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 249 return (EINVAL); 250 flags |= MAP_ANON; 251 pos = 0; 252 } 253 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | 254 MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | 255 MAP_PREFAULT_READ | 256 #ifdef MAP_32BIT 257 MAP_32BIT | 258 #endif 259 MAP_ALIGNMENT_MASK)) != 0) 260 return (EINVAL); 261 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) 262 return (EINVAL); 263 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) 264 return (EINVAL); 265 if (prot != PROT_NONE && 266 (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) 267 return (EINVAL); 268 269 /* 270 * Align the file position to a page boundary, 271 * and save its page offset component. 272 */ 273 pageoff = (pos & PAGE_MASK); 274 pos -= pageoff; 275 276 /* Adjust size for rounding (on both ends). */ 277 size += pageoff; /* low end... */ 278 size = (vm_size_t) round_page(size); /* hi end */ 279 280 /* Ensure alignment is at least a page and fits in a pointer. */ 281 align = flags & MAP_ALIGNMENT_MASK; 282 if (align != 0 && align != MAP_ALIGNED_SUPER && 283 (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || 284 align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) 285 return (EINVAL); 286 287 /* 288 * Check for illegal addresses. Watch out for address wrap... Note 289 * that VM_*_ADDRESS are not constants due to casts (argh). 290 */ 291 if (flags & MAP_FIXED) { 292 /* 293 * The specified address must have the same remainder 294 * as the file offset taken modulo PAGE_SIZE, so it 295 * should be aligned after adjustment by pageoff. 296 */ 297 addr -= pageoff; 298 if (addr & PAGE_MASK) 299 return (EINVAL); 300 301 /* Address range must be all in user VM space. */ 302 if (addr < vm_map_min(&vms->vm_map) || 303 addr + size > vm_map_max(&vms->vm_map)) 304 return (EINVAL); 305 if (addr + size < addr) 306 return (EINVAL); 307 #ifdef MAP_32BIT 308 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) 309 return (EINVAL); 310 } else if (flags & MAP_32BIT) { 311 /* 312 * For MAP_32BIT, override the hint if it is too high and 313 * do not bother moving the mapping past the heap (since 314 * the heap is usually above 2GB). 315 */ 316 if (addr + size > MAP_32BIT_MAX_ADDR) 317 addr = 0; 318 #endif 319 } else { 320 /* 321 * XXX for non-fixed mappings where no hint is provided or 322 * the hint would fall in the potential heap space, 323 * place it after the end of the largest possible heap. 324 * 325 * There should really be a pmap call to determine a reasonable 326 * location. 327 */ 328 PROC_LOCK(td->td_proc); 329 if (addr == 0 || 330 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 331 addr < round_page((vm_offset_t)vms->vm_daddr + 332 lim_max(td->td_proc, RLIMIT_DATA)))) 333 addr = round_page((vm_offset_t)vms->vm_daddr + 334 lim_max(td->td_proc, RLIMIT_DATA)); 335 PROC_UNLOCK(td->td_proc); 336 } 337 if (flags & MAP_ANON) { 338 /* 339 * Mapping blank space is trivial. 340 */ 341 handle = NULL; 342 handle_type = OBJT_DEFAULT; 343 maxprot = VM_PROT_ALL; 344 cap_maxprot = VM_PROT_ALL; 345 } else { 346 /* 347 * Mapping file, get fp for validation and don't let the 348 * descriptor disappear on us if we block. Check capability 349 * rights, but also return the maximum rights to be combined 350 * with maxprot later. 351 */ 352 cap_rights_init(&rights, CAP_MMAP); 353 if (prot & PROT_READ) 354 cap_rights_set(&rights, CAP_MMAP_R); 355 if ((flags & MAP_SHARED) != 0) { 356 if (prot & PROT_WRITE) 357 cap_rights_set(&rights, CAP_MMAP_W); 358 } 359 if (prot & PROT_EXEC) 360 cap_rights_set(&rights, CAP_MMAP_X); 361 error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp); 362 if (error != 0) 363 goto done; 364 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && 365 td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) { 366 error = EINVAL; 367 goto done; 368 } 369 if (fp->f_type == DTYPE_SHM) { 370 handle = fp->f_data; 371 handle_type = OBJT_SWAP; 372 maxprot = VM_PROT_NONE; 373 374 /* FREAD should always be set. */ 375 if (fp->f_flag & FREAD) 376 maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; 377 if (fp->f_flag & FWRITE) 378 maxprot |= VM_PROT_WRITE; 379 goto map; 380 } 381 if (fp->f_type != DTYPE_VNODE) { 382 error = ENODEV; 383 goto done; 384 } 385 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ 386 defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) 387 /* 388 * POSIX shared-memory objects are defined to have 389 * kernel persistence, and are not defined to support 390 * read(2)/write(2) -- or even open(2). Thus, we can 391 * use MAP_ASYNC to trade on-disk coherence for speed. 392 * The shm_open(3) library routine turns on the FPOSIXSHM 393 * flag to request this behavior. 394 */ 395 if (fp->f_flag & FPOSIXSHM) 396 flags |= MAP_NOSYNC; 397 #endif 398 vp = fp->f_vnode; 399 /* 400 * Ensure that file and memory protections are 401 * compatible. Note that we only worry about 402 * writability if mapping is shared; in this case, 403 * current and max prot are dictated by the open file. 404 * XXX use the vnode instead? Problem is: what 405 * credentials do we use for determination? What if 406 * proc does a setuid? 407 */ 408 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) 409 maxprot = VM_PROT_NONE; 410 else 411 maxprot = VM_PROT_EXECUTE; 412 if (fp->f_flag & FREAD) { 413 maxprot |= VM_PROT_READ; 414 } else if (prot & PROT_READ) { 415 error = EACCES; 416 goto done; 417 } 418 /* 419 * If we are sharing potential changes (either via 420 * MAP_SHARED or via the implicit sharing of character 421 * device mappings), and we are trying to get write 422 * permission although we opened it without asking 423 * for it, bail out. 424 */ 425 if ((flags & MAP_SHARED) != 0) { 426 if ((fp->f_flag & FWRITE) != 0) { 427 maxprot |= VM_PROT_WRITE; 428 } else if ((prot & PROT_WRITE) != 0) { 429 error = EACCES; 430 goto done; 431 } 432 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { 433 maxprot |= VM_PROT_WRITE; 434 cap_maxprot |= VM_PROT_WRITE; 435 } 436 handle = (void *)vp; 437 handle_type = OBJT_VNODE; 438 } 439 map: 440 td->td_fpop = fp; 441 maxprot &= cap_maxprot; 442 443 /* This relies on VM_PROT_* matching PROT_*. */ 444 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 445 flags, handle_type, handle, pos); 446 td->td_fpop = NULL; 447 #ifdef HWPMC_HOOKS 448 /* inform hwpmc(4) if an executable is being mapped */ 449 if (error == 0 && handle_type == OBJT_VNODE && 450 (prot & PROT_EXEC)) { 451 pkm.pm_file = handle; 452 pkm.pm_address = (uintptr_t) addr; 453 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); 454 } 455 #endif 456 if (error == 0) 457 td->td_retval[0] = (register_t) (addr + pageoff); 458 done: 459 if (fp) 460 fdrop(fp, td); 461 462 return (error); 463 } 464 465 #if defined(COMPAT_FREEBSD6) 466 int 467 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 468 { 469 struct mmap_args oargs; 470 471 oargs.addr = uap->addr; 472 oargs.len = uap->len; 473 oargs.prot = uap->prot; 474 oargs.flags = uap->flags; 475 oargs.fd = uap->fd; 476 oargs.pos = uap->pos; 477 return (sys_mmap(td, &oargs)); 478 } 479 #endif 480 481 #ifdef COMPAT_43 482 #ifndef _SYS_SYSPROTO_H_ 483 struct ommap_args { 484 caddr_t addr; 485 int len; 486 int prot; 487 int flags; 488 int fd; 489 long pos; 490 }; 491 #endif 492 int 493 ommap(td, uap) 494 struct thread *td; 495 struct ommap_args *uap; 496 { 497 struct mmap_args nargs; 498 static const char cvtbsdprot[8] = { 499 0, 500 PROT_EXEC, 501 PROT_WRITE, 502 PROT_EXEC | PROT_WRITE, 503 PROT_READ, 504 PROT_EXEC | PROT_READ, 505 PROT_WRITE | PROT_READ, 506 PROT_EXEC | PROT_WRITE | PROT_READ, 507 }; 508 509 #define OMAP_ANON 0x0002 510 #define OMAP_COPY 0x0020 511 #define OMAP_SHARED 0x0010 512 #define OMAP_FIXED 0x0100 513 514 nargs.addr = uap->addr; 515 nargs.len = uap->len; 516 nargs.prot = cvtbsdprot[uap->prot & 0x7]; 517 #ifdef COMPAT_FREEBSD32 518 #if defined(__amd64__) 519 if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && 520 nargs.prot != 0) 521 nargs.prot |= PROT_EXEC; 522 #endif 523 #endif 524 nargs.flags = 0; 525 if (uap->flags & OMAP_ANON) 526 nargs.flags |= MAP_ANON; 527 if (uap->flags & OMAP_COPY) 528 nargs.flags |= MAP_COPY; 529 if (uap->flags & OMAP_SHARED) 530 nargs.flags |= MAP_SHARED; 531 else 532 nargs.flags |= MAP_PRIVATE; 533 if (uap->flags & OMAP_FIXED) 534 nargs.flags |= MAP_FIXED; 535 nargs.fd = uap->fd; 536 nargs.pos = uap->pos; 537 return (sys_mmap(td, &nargs)); 538 } 539 #endif /* COMPAT_43 */ 540 541 542 #ifndef _SYS_SYSPROTO_H_ 543 struct msync_args { 544 void *addr; 545 size_t len; 546 int flags; 547 }; 548 #endif 549 /* 550 * MPSAFE 551 */ 552 int 553 sys_msync(td, uap) 554 struct thread *td; 555 struct msync_args *uap; 556 { 557 vm_offset_t addr; 558 vm_size_t size, pageoff; 559 int flags; 560 vm_map_t map; 561 int rv; 562 563 addr = (vm_offset_t) uap->addr; 564 size = uap->len; 565 flags = uap->flags; 566 567 pageoff = (addr & PAGE_MASK); 568 addr -= pageoff; 569 size += pageoff; 570 size = (vm_size_t) round_page(size); 571 if (addr + size < addr) 572 return (EINVAL); 573 574 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 575 return (EINVAL); 576 577 map = &td->td_proc->p_vmspace->vm_map; 578 579 /* 580 * Clean the pages and interpret the return value. 581 */ 582 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 583 (flags & MS_INVALIDATE) != 0); 584 switch (rv) { 585 case KERN_SUCCESS: 586 return (0); 587 case KERN_INVALID_ADDRESS: 588 return (ENOMEM); 589 case KERN_INVALID_ARGUMENT: 590 return (EBUSY); 591 case KERN_FAILURE: 592 return (EIO); 593 default: 594 return (EINVAL); 595 } 596 } 597 598 #ifndef _SYS_SYSPROTO_H_ 599 struct munmap_args { 600 void *addr; 601 size_t len; 602 }; 603 #endif 604 /* 605 * MPSAFE 606 */ 607 int 608 sys_munmap(td, uap) 609 struct thread *td; 610 struct munmap_args *uap; 611 { 612 #ifdef HWPMC_HOOKS 613 struct pmckern_map_out pkm; 614 vm_map_entry_t entry; 615 #endif 616 vm_offset_t addr; 617 vm_size_t size, pageoff; 618 vm_map_t map; 619 620 addr = (vm_offset_t) uap->addr; 621 size = uap->len; 622 if (size == 0) 623 return (EINVAL); 624 625 pageoff = (addr & PAGE_MASK); 626 addr -= pageoff; 627 size += pageoff; 628 size = (vm_size_t) round_page(size); 629 if (addr + size < addr) 630 return (EINVAL); 631 632 /* 633 * Check for illegal addresses. Watch out for address wrap... 634 */ 635 map = &td->td_proc->p_vmspace->vm_map; 636 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 637 return (EINVAL); 638 vm_map_lock(map); 639 #ifdef HWPMC_HOOKS 640 /* 641 * Inform hwpmc if the address range being unmapped contains 642 * an executable region. 643 */ 644 pkm.pm_address = (uintptr_t) NULL; 645 if (vm_map_lookup_entry(map, addr, &entry)) { 646 for (; 647 entry != &map->header && entry->start < addr + size; 648 entry = entry->next) { 649 if (vm_map_check_protection(map, entry->start, 650 entry->end, VM_PROT_EXECUTE) == TRUE) { 651 pkm.pm_address = (uintptr_t) addr; 652 pkm.pm_size = (size_t) size; 653 break; 654 } 655 } 656 } 657 #endif 658 vm_map_delete(map, addr, addr + size); 659 660 #ifdef HWPMC_HOOKS 661 /* downgrade the lock to prevent a LOR with the pmc-sx lock */ 662 vm_map_lock_downgrade(map); 663 if (pkm.pm_address != (uintptr_t) NULL) 664 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); 665 vm_map_unlock_read(map); 666 #else 667 vm_map_unlock(map); 668 #endif 669 /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ 670 return (0); 671 } 672 673 #ifndef _SYS_SYSPROTO_H_ 674 struct mprotect_args { 675 const void *addr; 676 size_t len; 677 int prot; 678 }; 679 #endif 680 /* 681 * MPSAFE 682 */ 683 int 684 sys_mprotect(td, uap) 685 struct thread *td; 686 struct mprotect_args *uap; 687 { 688 vm_offset_t addr; 689 vm_size_t size, pageoff; 690 vm_prot_t prot; 691 692 addr = (vm_offset_t) uap->addr; 693 size = uap->len; 694 prot = uap->prot & VM_PROT_ALL; 695 696 pageoff = (addr & PAGE_MASK); 697 addr -= pageoff; 698 size += pageoff; 699 size = (vm_size_t) round_page(size); 700 if (addr + size < addr) 701 return (EINVAL); 702 703 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 704 addr + size, prot, FALSE)) { 705 case KERN_SUCCESS: 706 return (0); 707 case KERN_PROTECTION_FAILURE: 708 return (EACCES); 709 case KERN_RESOURCE_SHORTAGE: 710 return (ENOMEM); 711 } 712 return (EINVAL); 713 } 714 715 #ifndef _SYS_SYSPROTO_H_ 716 struct minherit_args { 717 void *addr; 718 size_t len; 719 int inherit; 720 }; 721 #endif 722 /* 723 * MPSAFE 724 */ 725 int 726 sys_minherit(td, uap) 727 struct thread *td; 728 struct minherit_args *uap; 729 { 730 vm_offset_t addr; 731 vm_size_t size, pageoff; 732 vm_inherit_t inherit; 733 734 addr = (vm_offset_t)uap->addr; 735 size = uap->len; 736 inherit = uap->inherit; 737 738 pageoff = (addr & PAGE_MASK); 739 addr -= pageoff; 740 size += pageoff; 741 size = (vm_size_t) round_page(size); 742 if (addr + size < addr) 743 return (EINVAL); 744 745 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 746 addr + size, inherit)) { 747 case KERN_SUCCESS: 748 return (0); 749 case KERN_PROTECTION_FAILURE: 750 return (EACCES); 751 } 752 return (EINVAL); 753 } 754 755 #ifndef _SYS_SYSPROTO_H_ 756 struct madvise_args { 757 void *addr; 758 size_t len; 759 int behav; 760 }; 761 #endif 762 763 /* 764 * MPSAFE 765 */ 766 int 767 sys_madvise(td, uap) 768 struct thread *td; 769 struct madvise_args *uap; 770 { 771 vm_offset_t start, end; 772 vm_map_t map; 773 int flags; 774 775 /* 776 * Check for our special case, advising the swap pager we are 777 * "immortal." 778 */ 779 if (uap->behav == MADV_PROTECT) { 780 flags = PPROT_SET; 781 return (kern_procctl(td, P_PID, td->td_proc->p_pid, 782 PROC_SPROTECT, &flags)); 783 } 784 785 /* 786 * Check for illegal behavior 787 */ 788 if (uap->behav < 0 || uap->behav > MADV_CORE) 789 return (EINVAL); 790 /* 791 * Check for illegal addresses. Watch out for address wrap... Note 792 * that VM_*_ADDRESS are not constants due to casts (argh). 793 */ 794 map = &td->td_proc->p_vmspace->vm_map; 795 if ((vm_offset_t)uap->addr < vm_map_min(map) || 796 (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) 797 return (EINVAL); 798 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 799 return (EINVAL); 800 801 /* 802 * Since this routine is only advisory, we default to conservative 803 * behavior. 804 */ 805 start = trunc_page((vm_offset_t) uap->addr); 806 end = round_page((vm_offset_t) uap->addr + uap->len); 807 808 if (vm_map_madvise(map, start, end, uap->behav)) 809 return (EINVAL); 810 return (0); 811 } 812 813 #ifndef _SYS_SYSPROTO_H_ 814 struct mincore_args { 815 const void *addr; 816 size_t len; 817 char *vec; 818 }; 819 #endif 820 821 /* 822 * MPSAFE 823 */ 824 int 825 sys_mincore(td, uap) 826 struct thread *td; 827 struct mincore_args *uap; 828 { 829 vm_offset_t addr, first_addr; 830 vm_offset_t end, cend; 831 pmap_t pmap; 832 vm_map_t map; 833 char *vec; 834 int error = 0; 835 int vecindex, lastvecindex; 836 vm_map_entry_t current; 837 vm_map_entry_t entry; 838 vm_object_t object; 839 vm_paddr_t locked_pa; 840 vm_page_t m; 841 vm_pindex_t pindex; 842 int mincoreinfo; 843 unsigned int timestamp; 844 boolean_t locked; 845 846 /* 847 * Make sure that the addresses presented are valid for user 848 * mode. 849 */ 850 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 851 end = addr + (vm_size_t)round_page(uap->len); 852 map = &td->td_proc->p_vmspace->vm_map; 853 if (end > vm_map_max(map) || end < addr) 854 return (ENOMEM); 855 856 /* 857 * Address of byte vector 858 */ 859 vec = uap->vec; 860 861 pmap = vmspace_pmap(td->td_proc->p_vmspace); 862 863 vm_map_lock_read(map); 864 RestartScan: 865 timestamp = map->timestamp; 866 867 if (!vm_map_lookup_entry(map, addr, &entry)) { 868 vm_map_unlock_read(map); 869 return (ENOMEM); 870 } 871 872 /* 873 * Do this on a map entry basis so that if the pages are not 874 * in the current processes address space, we can easily look 875 * up the pages elsewhere. 876 */ 877 lastvecindex = -1; 878 for (current = entry; 879 (current != &map->header) && (current->start < end); 880 current = current->next) { 881 882 /* 883 * check for contiguity 884 */ 885 if (current->end < end && 886 (entry->next == &map->header || 887 current->next->start > current->end)) { 888 vm_map_unlock_read(map); 889 return (ENOMEM); 890 } 891 892 /* 893 * ignore submaps (for now) or null objects 894 */ 895 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 896 current->object.vm_object == NULL) 897 continue; 898 899 /* 900 * limit this scan to the current map entry and the 901 * limits for the mincore call 902 */ 903 if (addr < current->start) 904 addr = current->start; 905 cend = current->end; 906 if (cend > end) 907 cend = end; 908 909 /* 910 * scan this entry one page at a time 911 */ 912 while (addr < cend) { 913 /* 914 * Check pmap first, it is likely faster, also 915 * it can provide info as to whether we are the 916 * one referencing or modifying the page. 917 */ 918 object = NULL; 919 locked_pa = 0; 920 retry: 921 m = NULL; 922 mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); 923 if (locked_pa != 0) { 924 /* 925 * The page is mapped by this process but not 926 * both accessed and modified. It is also 927 * managed. Acquire the object lock so that 928 * other mappings might be examined. 929 */ 930 m = PHYS_TO_VM_PAGE(locked_pa); 931 if (m->object != object) { 932 if (object != NULL) 933 VM_OBJECT_WUNLOCK(object); 934 object = m->object; 935 locked = VM_OBJECT_TRYWLOCK(object); 936 vm_page_unlock(m); 937 if (!locked) { 938 VM_OBJECT_WLOCK(object); 939 vm_page_lock(m); 940 goto retry; 941 } 942 } else 943 vm_page_unlock(m); 944 KASSERT(m->valid == VM_PAGE_BITS_ALL, 945 ("mincore: page %p is mapped but invalid", 946 m)); 947 } else if (mincoreinfo == 0) { 948 /* 949 * The page is not mapped by this process. If 950 * the object implements managed pages, then 951 * determine if the page is resident so that 952 * the mappings might be examined. 953 */ 954 if (current->object.vm_object != object) { 955 if (object != NULL) 956 VM_OBJECT_WUNLOCK(object); 957 object = current->object.vm_object; 958 VM_OBJECT_WLOCK(object); 959 } 960 if (object->type == OBJT_DEFAULT || 961 object->type == OBJT_SWAP || 962 object->type == OBJT_VNODE) { 963 pindex = OFF_TO_IDX(current->offset + 964 (addr - current->start)); 965 m = vm_page_lookup(object, pindex); 966 if (m == NULL && 967 vm_page_is_cached(object, pindex)) 968 mincoreinfo = MINCORE_INCORE; 969 if (m != NULL && m->valid == 0) 970 m = NULL; 971 if (m != NULL) 972 mincoreinfo = MINCORE_INCORE; 973 } 974 } 975 if (m != NULL) { 976 /* Examine other mappings to the page. */ 977 if (m->dirty == 0 && pmap_is_modified(m)) 978 vm_page_dirty(m); 979 if (m->dirty != 0) 980 mincoreinfo |= MINCORE_MODIFIED_OTHER; 981 /* 982 * The first test for PGA_REFERENCED is an 983 * optimization. The second test is 984 * required because a concurrent pmap 985 * operation could clear the last reference 986 * and set PGA_REFERENCED before the call to 987 * pmap_is_referenced(). 988 */ 989 if ((m->aflags & PGA_REFERENCED) != 0 || 990 pmap_is_referenced(m) || 991 (m->aflags & PGA_REFERENCED) != 0) 992 mincoreinfo |= MINCORE_REFERENCED_OTHER; 993 } 994 if (object != NULL) 995 VM_OBJECT_WUNLOCK(object); 996 997 /* 998 * subyte may page fault. In case it needs to modify 999 * the map, we release the lock. 1000 */ 1001 vm_map_unlock_read(map); 1002 1003 /* 1004 * calculate index into user supplied byte vector 1005 */ 1006 vecindex = OFF_TO_IDX(addr - first_addr); 1007 1008 /* 1009 * If we have skipped map entries, we need to make sure that 1010 * the byte vector is zeroed for those skipped entries. 1011 */ 1012 while ((lastvecindex + 1) < vecindex) { 1013 ++lastvecindex; 1014 error = subyte(vec + lastvecindex, 0); 1015 if (error) { 1016 error = EFAULT; 1017 goto done2; 1018 } 1019 } 1020 1021 /* 1022 * Pass the page information to the user 1023 */ 1024 error = subyte(vec + vecindex, mincoreinfo); 1025 if (error) { 1026 error = EFAULT; 1027 goto done2; 1028 } 1029 1030 /* 1031 * If the map has changed, due to the subyte, the previous 1032 * output may be invalid. 1033 */ 1034 vm_map_lock_read(map); 1035 if (timestamp != map->timestamp) 1036 goto RestartScan; 1037 1038 lastvecindex = vecindex; 1039 addr += PAGE_SIZE; 1040 } 1041 } 1042 1043 /* 1044 * subyte may page fault. In case it needs to modify 1045 * the map, we release the lock. 1046 */ 1047 vm_map_unlock_read(map); 1048 1049 /* 1050 * Zero the last entries in the byte vector. 1051 */ 1052 vecindex = OFF_TO_IDX(end - first_addr); 1053 while ((lastvecindex + 1) < vecindex) { 1054 ++lastvecindex; 1055 error = subyte(vec + lastvecindex, 0); 1056 if (error) { 1057 error = EFAULT; 1058 goto done2; 1059 } 1060 } 1061 1062 /* 1063 * If the map has changed, due to the subyte, the previous 1064 * output may be invalid. 1065 */ 1066 vm_map_lock_read(map); 1067 if (timestamp != map->timestamp) 1068 goto RestartScan; 1069 vm_map_unlock_read(map); 1070 done2: 1071 return (error); 1072 } 1073 1074 #ifndef _SYS_SYSPROTO_H_ 1075 struct mlock_args { 1076 const void *addr; 1077 size_t len; 1078 }; 1079 #endif 1080 /* 1081 * MPSAFE 1082 */ 1083 int 1084 sys_mlock(td, uap) 1085 struct thread *td; 1086 struct mlock_args *uap; 1087 { 1088 1089 return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len)); 1090 } 1091 1092 int 1093 vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len) 1094 { 1095 vm_offset_t addr, end, last, start; 1096 vm_size_t npages, size; 1097 vm_map_t map; 1098 unsigned long nsize; 1099 int error; 1100 1101 error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); 1102 if (error) 1103 return (error); 1104 addr = (vm_offset_t)addr0; 1105 size = len; 1106 last = addr + size; 1107 start = trunc_page(addr); 1108 end = round_page(last); 1109 if (last < addr || end < addr) 1110 return (EINVAL); 1111 npages = atop(end - start); 1112 if (npages > vm_page_max_wired) 1113 return (ENOMEM); 1114 map = &proc->p_vmspace->vm_map; 1115 PROC_LOCK(proc); 1116 nsize = ptoa(npages + pmap_wired_count(map->pmap)); 1117 if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) { 1118 PROC_UNLOCK(proc); 1119 return (ENOMEM); 1120 } 1121 PROC_UNLOCK(proc); 1122 if (npages + vm_cnt.v_wire_count > vm_page_max_wired) 1123 return (EAGAIN); 1124 #ifdef RACCT 1125 if (racct_enable) { 1126 PROC_LOCK(proc); 1127 error = racct_set(proc, RACCT_MEMLOCK, nsize); 1128 PROC_UNLOCK(proc); 1129 if (error != 0) 1130 return (ENOMEM); 1131 } 1132 #endif 1133 error = vm_map_wire(map, start, end, 1134 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1135 #ifdef RACCT 1136 if (racct_enable && error != KERN_SUCCESS) { 1137 PROC_LOCK(proc); 1138 racct_set(proc, RACCT_MEMLOCK, 1139 ptoa(pmap_wired_count(map->pmap))); 1140 PROC_UNLOCK(proc); 1141 } 1142 #endif 1143 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1144 } 1145 1146 #ifndef _SYS_SYSPROTO_H_ 1147 struct mlockall_args { 1148 int how; 1149 }; 1150 #endif 1151 1152 /* 1153 * MPSAFE 1154 */ 1155 int 1156 sys_mlockall(td, uap) 1157 struct thread *td; 1158 struct mlockall_args *uap; 1159 { 1160 vm_map_t map; 1161 int error; 1162 1163 map = &td->td_proc->p_vmspace->vm_map; 1164 error = priv_check(td, PRIV_VM_MLOCK); 1165 if (error) 1166 return (error); 1167 1168 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1169 return (EINVAL); 1170 1171 /* 1172 * If wiring all pages in the process would cause it to exceed 1173 * a hard resource limit, return ENOMEM. 1174 */ 1175 if (!old_mlock && uap->how & MCL_CURRENT) { 1176 PROC_LOCK(td->td_proc); 1177 if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { 1178 PROC_UNLOCK(td->td_proc); 1179 return (ENOMEM); 1180 } 1181 PROC_UNLOCK(td->td_proc); 1182 } 1183 #ifdef RACCT 1184 if (racct_enable) { 1185 PROC_LOCK(td->td_proc); 1186 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); 1187 PROC_UNLOCK(td->td_proc); 1188 if (error != 0) 1189 return (ENOMEM); 1190 } 1191 #endif 1192 1193 if (uap->how & MCL_FUTURE) { 1194 vm_map_lock(map); 1195 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1196 vm_map_unlock(map); 1197 error = 0; 1198 } 1199 1200 if (uap->how & MCL_CURRENT) { 1201 /* 1202 * P1003.1-2001 mandates that all currently mapped pages 1203 * will be memory resident and locked (wired) upon return 1204 * from mlockall(). vm_map_wire() will wire pages, by 1205 * calling vm_fault_wire() for each page in the region. 1206 */ 1207 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1208 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1209 error = (error == KERN_SUCCESS ? 0 : EAGAIN); 1210 } 1211 #ifdef RACCT 1212 if (racct_enable && error != KERN_SUCCESS) { 1213 PROC_LOCK(td->td_proc); 1214 racct_set(td->td_proc, RACCT_MEMLOCK, 1215 ptoa(pmap_wired_count(map->pmap))); 1216 PROC_UNLOCK(td->td_proc); 1217 } 1218 #endif 1219 1220 return (error); 1221 } 1222 1223 #ifndef _SYS_SYSPROTO_H_ 1224 struct munlockall_args { 1225 register_t dummy; 1226 }; 1227 #endif 1228 1229 /* 1230 * MPSAFE 1231 */ 1232 int 1233 sys_munlockall(td, uap) 1234 struct thread *td; 1235 struct munlockall_args *uap; 1236 { 1237 vm_map_t map; 1238 int error; 1239 1240 map = &td->td_proc->p_vmspace->vm_map; 1241 error = priv_check(td, PRIV_VM_MUNLOCK); 1242 if (error) 1243 return (error); 1244 1245 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1246 vm_map_lock(map); 1247 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1248 vm_map_unlock(map); 1249 1250 /* Forcibly unwire all pages. */ 1251 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1252 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1253 #ifdef RACCT 1254 if (racct_enable && error == KERN_SUCCESS) { 1255 PROC_LOCK(td->td_proc); 1256 racct_set(td->td_proc, RACCT_MEMLOCK, 0); 1257 PROC_UNLOCK(td->td_proc); 1258 } 1259 #endif 1260 1261 return (error); 1262 } 1263 1264 #ifndef _SYS_SYSPROTO_H_ 1265 struct munlock_args { 1266 const void *addr; 1267 size_t len; 1268 }; 1269 #endif 1270 /* 1271 * MPSAFE 1272 */ 1273 int 1274 sys_munlock(td, uap) 1275 struct thread *td; 1276 struct munlock_args *uap; 1277 { 1278 vm_offset_t addr, end, last, start; 1279 vm_size_t size; 1280 #ifdef RACCT 1281 vm_map_t map; 1282 #endif 1283 int error; 1284 1285 error = priv_check(td, PRIV_VM_MUNLOCK); 1286 if (error) 1287 return (error); 1288 addr = (vm_offset_t)uap->addr; 1289 size = uap->len; 1290 last = addr + size; 1291 start = trunc_page(addr); 1292 end = round_page(last); 1293 if (last < addr || end < addr) 1294 return (EINVAL); 1295 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1296 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1297 #ifdef RACCT 1298 if (racct_enable && error == KERN_SUCCESS) { 1299 PROC_LOCK(td->td_proc); 1300 map = &td->td_proc->p_vmspace->vm_map; 1301 racct_set(td->td_proc, RACCT_MEMLOCK, 1302 ptoa(pmap_wired_count(map->pmap))); 1303 PROC_UNLOCK(td->td_proc); 1304 } 1305 #endif 1306 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1307 } 1308 1309 /* 1310 * vm_mmap_vnode() 1311 * 1312 * Helper function for vm_mmap. Perform sanity check specific for mmap 1313 * operations on vnodes. 1314 * 1315 * For VCHR vnodes, the vnode lock is held over the call to 1316 * vm_mmap_cdev() to keep vp->v_rdev valid. 1317 */ 1318 int 1319 vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1320 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1321 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, 1322 boolean_t *writecounted) 1323 { 1324 struct vattr va; 1325 vm_object_t obj; 1326 vm_offset_t foff; 1327 struct ucred *cred; 1328 int error, flags, locktype; 1329 1330 cred = td->td_ucred; 1331 if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED)) 1332 locktype = LK_EXCLUSIVE; 1333 else 1334 locktype = LK_SHARED; 1335 if ((error = vget(vp, locktype, td)) != 0) 1336 return (error); 1337 foff = *foffp; 1338 flags = *flagsp; 1339 obj = vp->v_object; 1340 if (vp->v_type == VREG) { 1341 /* 1342 * Get the proper underlying object 1343 */ 1344 if (obj == NULL) { 1345 error = EINVAL; 1346 goto done; 1347 } 1348 if (obj->type == OBJT_VNODE && obj->handle != vp) { 1349 vput(vp); 1350 vp = (struct vnode *)obj->handle; 1351 /* 1352 * Bypass filesystems obey the mpsafety of the 1353 * underlying fs. Tmpfs never bypasses. 1354 */ 1355 error = vget(vp, locktype, td); 1356 if (error != 0) 1357 return (error); 1358 } 1359 if (locktype == LK_EXCLUSIVE) { 1360 *writecounted = TRUE; 1361 vnode_pager_update_writecount(obj, 0, objsize); 1362 } 1363 } else if (vp->v_type == VCHR) { 1364 error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp, 1365 vp->v_rdev, foffp, objp); 1366 if (error == 0) 1367 goto mark_atime; 1368 goto done; 1369 } else { 1370 error = EINVAL; 1371 goto done; 1372 } 1373 if ((error = VOP_GETATTR(vp, &va, cred))) 1374 goto done; 1375 #ifdef MAC 1376 error = mac_vnode_check_mmap(cred, vp, prot, flags); 1377 if (error != 0) 1378 goto done; 1379 #endif 1380 if ((flags & MAP_SHARED) != 0) { 1381 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1382 if (prot & PROT_WRITE) { 1383 error = EPERM; 1384 goto done; 1385 } 1386 *maxprotp &= ~VM_PROT_WRITE; 1387 } 1388 } 1389 /* 1390 * If it is a regular file without any references 1391 * we do not need to sync it. 1392 * Adjust object size to be the size of actual file. 1393 */ 1394 objsize = round_page(va.va_size); 1395 if (va.va_nlink == 0) 1396 flags |= MAP_NOSYNC; 1397 if (obj->type == OBJT_VNODE) { 1398 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, 1399 cred); 1400 if (obj == NULL) { 1401 error = ENOMEM; 1402 goto done; 1403 } 1404 } else { 1405 KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, 1406 ("wrong object type")); 1407 VM_OBJECT_WLOCK(obj); 1408 vm_object_reference_locked(obj); 1409 #if VM_NRESERVLEVEL > 0 1410 vm_object_color(obj, 0); 1411 #endif 1412 VM_OBJECT_WUNLOCK(obj); 1413 } 1414 *objp = obj; 1415 *flagsp = flags; 1416 1417 mark_atime: 1418 vfs_mark_atime(vp, cred); 1419 1420 done: 1421 if (error != 0 && *writecounted) { 1422 *writecounted = FALSE; 1423 vnode_pager_update_writecount(obj, objsize, 0); 1424 } 1425 vput(vp); 1426 return (error); 1427 } 1428 1429 /* 1430 * vm_mmap_cdev() 1431 * 1432 * MPSAFE 1433 * 1434 * Helper function for vm_mmap. Perform sanity check specific for mmap 1435 * operations on cdevs. 1436 */ 1437 int 1438 vm_mmap_cdev(struct thread *td, vm_size_t objsize, 1439 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1440 struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp) 1441 { 1442 vm_object_t obj; 1443 struct cdevsw *dsw; 1444 int error, flags, ref; 1445 1446 flags = *flagsp; 1447 1448 dsw = dev_refthread(cdev, &ref); 1449 if (dsw == NULL) 1450 return (ENXIO); 1451 if (dsw->d_flags & D_MMAP_ANON) { 1452 dev_relthread(cdev, ref); 1453 *maxprotp = VM_PROT_ALL; 1454 *flagsp |= MAP_ANON; 1455 return (0); 1456 } 1457 /* 1458 * cdevs do not provide private mappings of any kind. 1459 */ 1460 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1461 (prot & PROT_WRITE) != 0) { 1462 dev_relthread(cdev, ref); 1463 return (EACCES); 1464 } 1465 if (flags & (MAP_PRIVATE|MAP_COPY)) { 1466 dev_relthread(cdev, ref); 1467 return (EINVAL); 1468 } 1469 /* 1470 * Force device mappings to be shared. 1471 */ 1472 flags |= MAP_SHARED; 1473 #ifdef MAC_XXX 1474 error = mac_cdev_check_mmap(td->td_ucred, cdev, prot); 1475 if (error != 0) { 1476 dev_relthread(cdev, ref); 1477 return (error); 1478 } 1479 #endif 1480 /* 1481 * First, try d_mmap_single(). If that is not implemented 1482 * (returns ENODEV), fall back to using the device pager. 1483 * Note that d_mmap_single() must return a reference to the 1484 * object (it needs to bump the reference count of the object 1485 * it returns somehow). 1486 * 1487 * XXX assumes VM_PROT_* == PROT_* 1488 */ 1489 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1490 dev_relthread(cdev, ref); 1491 if (error != ENODEV) 1492 return (error); 1493 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, 1494 td->td_ucred); 1495 if (obj == NULL) 1496 return (EINVAL); 1497 *objp = obj; 1498 *flagsp = flags; 1499 return (0); 1500 } 1501 1502 /* 1503 * vm_mmap_shm() 1504 * 1505 * MPSAFE 1506 * 1507 * Helper function for vm_mmap. Perform sanity check specific for mmap 1508 * operations on shm file descriptors. 1509 */ 1510 int 1511 vm_mmap_shm(struct thread *td, vm_size_t objsize, 1512 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1513 struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) 1514 { 1515 int error; 1516 1517 if ((*flagsp & MAP_SHARED) != 0 && 1518 (*maxprotp & VM_PROT_WRITE) == 0 && 1519 (prot & PROT_WRITE) != 0) 1520 return (EACCES); 1521 #ifdef MAC 1522 error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); 1523 if (error != 0) 1524 return (error); 1525 #endif 1526 error = shm_mmap(shmfd, objsize, foff, objp); 1527 if (error) 1528 return (error); 1529 return (0); 1530 } 1531 1532 /* 1533 * vm_mmap() 1534 * 1535 * MPSAFE 1536 * 1537 * Internal version of mmap. Currently used by mmap, exec, and sys5 1538 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1539 */ 1540 int 1541 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1542 vm_prot_t maxprot, int flags, 1543 objtype_t handle_type, void *handle, 1544 vm_ooffset_t foff) 1545 { 1546 boolean_t fitit; 1547 vm_object_t object = NULL; 1548 struct thread *td = curthread; 1549 int docow, error, findspace, rv; 1550 boolean_t writecounted; 1551 1552 if (size == 0) 1553 return (0); 1554 1555 size = round_page(size); 1556 1557 if (map == &td->td_proc->p_vmspace->vm_map) { 1558 PROC_LOCK(td->td_proc); 1559 if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { 1560 PROC_UNLOCK(td->td_proc); 1561 return (ENOMEM); 1562 } 1563 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { 1564 PROC_UNLOCK(td->td_proc); 1565 return (ENOMEM); 1566 } 1567 if (!old_mlock && map->flags & MAP_WIREFUTURE) { 1568 if (ptoa(pmap_wired_count(map->pmap)) + size > 1569 lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { 1570 racct_set_force(td->td_proc, RACCT_VMEM, 1571 map->size); 1572 PROC_UNLOCK(td->td_proc); 1573 return (ENOMEM); 1574 } 1575 error = racct_set(td->td_proc, RACCT_MEMLOCK, 1576 ptoa(pmap_wired_count(map->pmap)) + size); 1577 if (error != 0) { 1578 racct_set_force(td->td_proc, RACCT_VMEM, 1579 map->size); 1580 PROC_UNLOCK(td->td_proc); 1581 return (error); 1582 } 1583 } 1584 PROC_UNLOCK(td->td_proc); 1585 } 1586 1587 /* 1588 * We currently can only deal with page aligned file offsets. 1589 * The check is here rather than in the syscall because the 1590 * kernel calls this function internally for other mmaping 1591 * operations (such as in exec) and non-aligned offsets will 1592 * cause pmap inconsistencies...so we want to be sure to 1593 * disallow this in all cases. 1594 */ 1595 if (foff & PAGE_MASK) 1596 return (EINVAL); 1597 1598 if ((flags & MAP_FIXED) == 0) { 1599 fitit = TRUE; 1600 *addr = round_page(*addr); 1601 } else { 1602 if (*addr != trunc_page(*addr)) 1603 return (EINVAL); 1604 fitit = FALSE; 1605 } 1606 writecounted = FALSE; 1607 1608 /* 1609 * Lookup/allocate object. 1610 */ 1611 switch (handle_type) { 1612 case OBJT_DEVICE: 1613 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, 1614 handle, &foff, &object); 1615 break; 1616 case OBJT_VNODE: 1617 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1618 handle, &foff, &object, &writecounted); 1619 break; 1620 case OBJT_SWAP: 1621 error = vm_mmap_shm(td, size, prot, &maxprot, &flags, 1622 handle, foff, &object); 1623 break; 1624 case OBJT_DEFAULT: 1625 if (handle == NULL) { 1626 error = 0; 1627 break; 1628 } 1629 /* FALLTHROUGH */ 1630 default: 1631 error = EINVAL; 1632 break; 1633 } 1634 if (error) 1635 return (error); 1636 if (flags & MAP_ANON) { 1637 object = NULL; 1638 docow = 0; 1639 /* 1640 * Unnamed anonymous regions always start at 0. 1641 */ 1642 if (handle == 0) 1643 foff = 0; 1644 } else if (flags & MAP_PREFAULT_READ) 1645 docow = MAP_PREFAULT; 1646 else 1647 docow = MAP_PREFAULT_PARTIAL; 1648 1649 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1650 docow |= MAP_COPY_ON_WRITE; 1651 if (flags & MAP_NOSYNC) 1652 docow |= MAP_DISABLE_SYNCER; 1653 if (flags & MAP_NOCORE) 1654 docow |= MAP_DISABLE_COREDUMP; 1655 /* Shared memory is also shared with children. */ 1656 if (flags & MAP_SHARED) 1657 docow |= MAP_INHERIT_SHARE; 1658 if (writecounted) 1659 docow |= MAP_VN_WRITECOUNT; 1660 if (flags & MAP_STACK) { 1661 if (object != NULL) 1662 return (EINVAL); 1663 docow |= MAP_STACK_GROWS_DOWN; 1664 } 1665 if ((flags & MAP_EXCL) != 0) 1666 docow |= MAP_CHECK_EXCL; 1667 1668 if (fitit) { 1669 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) 1670 findspace = VMFS_SUPER_SPACE; 1671 else if ((flags & MAP_ALIGNMENT_MASK) != 0) 1672 findspace = VMFS_ALIGNED_SPACE(flags >> 1673 MAP_ALIGNMENT_SHIFT); 1674 else 1675 findspace = VMFS_OPTIMAL_SPACE; 1676 rv = vm_map_find(map, object, foff, addr, size, 1677 #ifdef MAP_32BIT 1678 flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR : 1679 #endif 1680 0, findspace, prot, maxprot, docow); 1681 } else { 1682 rv = vm_map_fixed(map, object, foff, *addr, size, 1683 prot, maxprot, docow); 1684 } 1685 1686 if (rv == KERN_SUCCESS) { 1687 /* 1688 * If the process has requested that all future mappings 1689 * be wired, then heed this. 1690 */ 1691 if (map->flags & MAP_WIREFUTURE) { 1692 vm_map_wire(map, *addr, *addr + size, 1693 VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? 1694 VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); 1695 } 1696 } else { 1697 /* 1698 * If this mapping was accounted for in the vnode's 1699 * writecount, then undo that now. 1700 */ 1701 if (writecounted) 1702 vnode_pager_release_writecount(object, 0, size); 1703 /* 1704 * Lose the object reference. Will destroy the 1705 * object if it's an unnamed anonymous mapping 1706 * or named anonymous without other references. 1707 */ 1708 vm_object_deallocate(object); 1709 } 1710 return (vm_mmap_to_errno(rv)); 1711 } 1712 1713 /* 1714 * Translate a Mach VM return code to zero on success or the appropriate errno 1715 * on failure. 1716 */ 1717 int 1718 vm_mmap_to_errno(int rv) 1719 { 1720 1721 switch (rv) { 1722 case KERN_SUCCESS: 1723 return (0); 1724 case KERN_INVALID_ADDRESS: 1725 case KERN_NO_SPACE: 1726 return (ENOMEM); 1727 case KERN_PROTECTION_FAILURE: 1728 return (EACCES); 1729 default: 1730 return (EINVAL); 1731 } 1732 } 1733