/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
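/*
 * As an illustration of the offset handling below: a request to map a
 * file at offset 0x1234 is backed by the page at file offset 0x1000,
 * the length is grown by 0x234 and rounded up to a page boundary, and
 * the address returned to the caller is the start of the resulting
 * mapping plus 0x234.
 */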
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif				/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{

	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
	if (error)
		return (error);
	addr = (vm_offset_t)addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
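/*
 * Note that when the vm.old_mlock tunable declared at the top of this
 * file is non-zero, sys_mlockall() below skips its RLIMIT_MEMLOCK check
 * for MCL_CURRENT, matching the behavior described by the sysctl string.
 */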
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, findspace, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
		    0, findspace, prot, maxprot, docow);
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}