/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include "opt_hwpmc_hooks.h"
#include "opt_hwt_hooks.h"
#include "opt_vm.h"

#define	EXTERR_CATEGORY	EXTERR_CAT_MMAP
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/exterrvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifdef HWT_HOOKS
#include <dev/hwt/hwt_hook.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page protections in mmap() when none are specified");

_Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */
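/*
 * Illustrative note (a sketch, not part of the code above): the three knobs
 * declared above are ordinary read/write tunables, so they can be inspected
 * or changed at run time with sysctl(8), for example:
 *
 *	# sysctl vm.old_mlock vm.mincore_mapped vm.imply_prot_max
 *	# sysctl vm.imply_prot_max=1
 *
 * Because they are CTLFLAG_RWTUN, they may also be preset as loader
 * tunables (e.g. in loader.conf) before the kernel boots.
 */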
/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	}));
}

int
kern_mmap_maxprot(struct proc *p, int prot)
{

	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
		return (_PROT_ALL);
	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
	    prot != PROT_NONE)
		return (prot);
	return (_PROT_ALL);
}
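/*
 * Illustrative userland sketch (not compiled here): a caller can pre-declare
 * the largest protection it will ever request for a mapping by or'ing a
 * PROT_MAX() value into the prot argument; kern_mmap() below extracts it
 * with PROT_MAX_EXTRACT().  Error handling is elided.
 *
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, len,
 *	    PROT_READ | PROT_MAX(PROT_READ | PROT_WRITE),
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	// Later, mprotect(p, len, PROT_WRITE) stays within the declared
 *	// ceiling, while requesting PROT_EXEC would exceed it and fail.
 */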
int
kern_mmap(struct thread *td, const struct mmap_req *mrp)
{
	struct vmspace *vms;
	struct file *fp;
	struct proc *p;
	off_t pos;
	vm_offset_t addr, orig_addr;
	vm_size_t len, pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error, fd, flags, max_prot, prot;
	cap_rights_t rights;
	mmap_check_fp_fn check_fp_fn;

	orig_addr = addr = mrp->mr_hint;
	len = mrp->mr_len;
	prot = mrp->mr_prot;
	flags = mrp->mr_flags;
	fd = mrp->mr_fd;
	pos = mrp->mr_pos;
	check_fp_fn = mrp->mr_check_fp_fn;

	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) {
		return (EXTERROR(EINVAL, "unknown PROT bits"));
	}
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	if (max_prot != 0 && (max_prot & prot) != prot) {
		return (EXTERROR(ENOTSUP, "prot is not subset of max_prot"));
	}

	p = td->td_proc;

	/*
	 * Always honor PROT_MAX if set.  If not, default to all
	 * permissions unless we're implying maximum permissions.
	 */
	if (max_prot == 0)
		max_prot = kern_mmap_maxprot(p, prot);

	vms = p->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as filedescriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) {
			return (EXTERROR(EINVAL,
			    "offset not zero/fd not -1 for MAP_ANON",
			    fd, pos));
		}
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) !=
		    (PROT_READ | PROT_WRITE))) {
			return (EXTERROR(EINVAL, "MAP_STACK with prot < rw",
			    prot));
		}
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT |
	    MAP_ALIGNMENT_MASK)) != 0) {
		return (EXTERROR(EINVAL, "reserved flag set"));
	}
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) {
		return (EXTERROR(EINVAL, "EXCL without FIXED"));
	}
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED |
	    MAP_PRIVATE)) {
		return (EXTERROR(EINVAL, "both SHARED and PRIVATE set"));
	}
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) {
		return (EXTERROR(EINVAL, "invalid prot", prot));
	}
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
	    MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)) {
		return (EXTERROR(EINVAL, "GUARD with wrong parameters"));
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;			/* low end... */
	size = round_page(size);		/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) {
		return (EXTERROR(EINVAL, "bad alignment", align));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if ((addr & PAGE_MASK) != 0) {
			return (EXTERROR(EINVAL, "fixed mapping not aligned",
			    addr));
		}

		/* Address range must be all in user VM space. */
		if (!vm_map_range_valid(&vms->vm_map, addr, addr + size)) {
			EXTERROR(EINVAL, "mapping outside vm_map");
			return (EINVAL);
		}
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) {
			return (EXTERROR(EINVAL,
			    "fixed 32bit mapping does not fit into 4G"));
		}
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * For anonymous mappings within the address space of the
		 * calling process, the absence of a hint is handled at a
		 * lower level in order to implement different clustering
		 * strategies for ASLR.
		 */
		if (((flags & MAP_ANON) == 0 && addr == 0) ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.  For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    max_prot, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init_one(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set_one(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set_one(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set_one(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
			EXTERROR(EINVAL, "neither SHARED nor PRIVATE req");
			error = EINVAL;
			goto done;
		}
		if (check_fp_fn != NULL) {
			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
			    flags);
			if (error != 0)
				goto done;
		}
		if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
			addr = orig_addr;
		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    max_prot & cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = addr + pageoff;
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}
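/*
 * Illustrative userland sketch (not compiled here) for two of the
 * FreeBSD-specific flags validated above.  MAP_GUARD reserves an address
 * range that faults on any access and, per the checks in kern_mmap(), must
 * be combined with PROT_NONE, fd == -1 and offset 0.  MAP_ALIGNED(n) asks
 * for a mapping aligned to 2^n bytes.
 *
 *	#include <sys/mman.h>
 *
 *	// Reserve a 1 MB guard region below a stack we manage ourselves.
 *	void *guard = mmap(NULL, 1024 * 1024, PROT_NONE, MAP_GUARD, -1, 0);
 *
 *	// Ask for a 2 MB-aligned anonymous mapping (21 == log2(2 MB)).
 *	void *aligned = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(21), -1, 0);
 */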
#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	}));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
    int oflags, int fd, long pos)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

	if (len < 0)
		return (EINVAL);

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	prot = cvtbsdprot[oprot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (oflags & OMAP_ANON)
		flags |= MAP_ANON;
	if (oflags & OMAP_COPY)
		flags |= MAP_COPY;
	if (oflags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (oflags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = hint,
		.mr_len = len,
		.mr_prot = prot,
		.mr_flags = flags,
		.mr_fd = fd,
		.mr_pos = pos,
	}));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}
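/*
 * Illustrative userland sketch (not compiled here): msync(2) as handled
 * above.  MS_ASYNC and MS_INVALIDATE are rejected when combined, and a
 * range that is not entirely mapped comes back as ENOMEM
 * (KERN_INVALID_ADDRESS).
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (msync(addr, len, MS_SYNC) == -1)
 *		warn("msync");
 *	// msync(addr, len, MS_ASYNC | MS_INVALIDATE) fails with EINVAL.
 */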
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr, end;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	end = addr + size;
	map = &td->td_proc->p_vmspace->vm_map;
	if (!vm_map_range_valid(map, addr, end))
		return (EINVAL);

	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < end;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	rv = vm_map_delete(map, addr, end);

#ifdef HWT_HOOKS
	if (HWT_HOOK_INSTALLED && rv == KERN_SUCCESS) {
		struct hwt_record_entry ent;

		ent.addr = (uintptr_t) addr;
		ent.fullpath = NULL;
		ent.record_type = HWT_RECORD_MUNMAP;
		HWT_CALL_HOOK(td, HWT_RECORD, &ent);
	}
#endif

#ifdef HWPMC_HOOKS
	if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	return (vm_mmap_to_errno(rv));
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
	    uap->prot, 0));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot,
    int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;

	addr = addr0;
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	flags |= VM_MAP_PROTECT_SET_PROT;
	if (max_prot != 0)
		flags |= VM_MAP_PROTECT_SET_MAXPROT;
	vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
	    addr, addr + size, prot, max_prot, flags);

	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	case KERN_OUT_OF_BOUNDS:
		return (ENOTSUP);
	}
	return (EINVAL);
}
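/*
 * Illustrative userland sketch (not compiled here): like mmap(),
 * mprotect(2) accepts a PROT_MAX() component, which kern_mprotect() above
 * turns into VM_MAP_PROTECT_SET_MAXPROT.  Lowering the maximum caps what
 * any later mprotect() on the range may request.
 *
 *	#include <sys/mman.h>
 *
 *	// Drop the ceiling to read-only; later upgrades to PROT_WRITE or
 *	// PROT_EXEC on this range are expected to fail.
 *	mprotect(p, len, PROT_READ | PROT_MAX(PROT_READ));
 */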
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{

	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
	    uap->inherit));
}

int
kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)addr0;
	size = len;
	inherit = inherit0;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (!vm_map_range_valid(map, addr, addr + len))
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}
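/*
 * Illustrative userland sketch (not compiled here): mincore(2) fills one
 * status byte per page of the request.  What kern_mincore() below reports
 * for pages that are resident but not mapped by this pmap depends on the
 * vm.mincore_mapped knob declared near the top of this file.
 *
 *	#include <sys/mman.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	size_t npages = (len + getpagesize() - 1) / getpagesize();
 *	char *vec = malloc(npages);
 *	if (vec != NULL && mincore(addr, len, vec) == 0 &&
 *	    (vec[0] & MINCORE_INCORE))
 *		;	// first page of the range is resident
 *	free(vec);
 */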
int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	pmap_t pmap;
	vm_map_t map;
	vm_map_entry_t current, entry;
	vm_object_t object;
	vm_offset_t addr, cend, end, first_addr;
	vm_paddr_t pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int error, lastvecindex, mincoreinfo, vecindex;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page(addr0);
	end = round_page(addr0 + len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current processes address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	while (entry->start < end) {
		/*
		 * check for contiguity
		 */
		current = entry;
		entry = vm_map_entry_succ(current);
		if (current->end < end &&
		    entry->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		for (; addr < cend; addr += PAGE_SIZE) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			m = NULL;
			object = NULL;
retry:
			pa = 0;
			mincoreinfo = pmap_mincore(pmap, addr, &pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
				 */
				;
			} else if (pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.  The page's
				 * identity may change at any point before its
				 * object lock is acquired, so re-validate if
				 * necessary.
				 */
				m = PHYS_TO_VM_PAGE(pa);
				while (object == NULL || m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = atomic_load_ptr(&m->object);
					if (object == NULL)
						goto retry;
					VM_OBJECT_WLOCK(object);
				}
				if (pa != pmap_extract(pmap, addr))
					goto retry;
				KASSERT(vm_page_all_valid(m),
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if ((object->flags & OBJ_SWAP) != 0 ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && vm_page_none_valid(m))
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				VM_OBJECT_ASSERT_WLOCKED(m->object);

				/* Examine other mappings of the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;

				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->a.flags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->a.flags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	switch (error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ARGUMENT:
		return (EINVAL);
	default:
		return (ENOMEM);
	}
}
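/*
 * Illustrative userland sketch (not compiled here): mlock(2) is subject to
 * the checks in kern_mlock() above: the PRIV_VM_MLOCK privilege, the global
 * vm_page_max_user_wired cap, RLIMIT_MEMLOCK and, with RACCT, the
 * memory-locked resource.  Most failures surface as ENOMEM.
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (mlock(buf, buflen) == -1)
 *		warn("mlock");	// typically ENOMEM or EPERM
 */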
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{

	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
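/*
 * Illustrative userland sketch (not compiled here): mlockall(2) and
 * munlockall(2) as handled above.  MCL_FUTURE only toggles MAP_WIREFUTURE
 * on the vm_map; the wiring of later mappings is performed when they are
 * created, in vm_mmap_object() below.
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		warn("mlockall");	// ENOMEM, EAGAIN or EPERM
 *	// ... work with wired memory ...
 *	munlockall();
 */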
/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EXTERROR(EINVAL, "non-reg file");
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}

	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) {
		return (EXTERROR(EINVAL, "cdev mapping must be shared"));
	}

	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL) {
		return (EXTERROR(EINVAL,
		    "cdev driver does not support mmap"));
	}
	*objp = obj;
	*flagsp = flags;
	return (0);
}
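/*
 * Driver-side sketch (illustrative only, hypothetical driver name): a
 * character device driver that just wants anonymous-memory semantics for
 * mmap() can set D_MMAP_ANON in its cdevsw; vm_mmap_cdev() above then turns
 * the request into a MAP_ANON mapping without consulting the device.
 *
 *	static struct cdevsw anonmem_cdevsw = {
 *		.d_version = D_VERSION,
 *		.d_name = "anonmem",
 *		.d_flags = D_MMAP_ANON,
 *	};
 */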
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0) {
		return (EXTERROR(EINVAL, "zero-sized req"));
	}

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	default:
		error = EXTERROR(EINVAL, "unsupported backing obj type",
		    handle_type);
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vm_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
	return (error);
}
int
kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
{
	int error;

	RACCT_PROC_LOCK(td->td_proc);
	if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
		if (ptoa(pmap_wired_count(map->pmap)) + size >
		    lim_cur(td, RLIMIT_MEMLOCK)) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		error = racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)) + size);
		if (error != 0) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (error);
		}
	}
	RACCT_PROC_UNLOCK(td->td_proc);
	return (0);
}

/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	vm_offset_t default_addr, max_addr;
	int docow, error, findspace, rv;
	bool curmap, fitit;

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		error = kern_mmap_racct_check(td, map, size);
		if (error != 0)
			return (error);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if ((foff & PAGE_MASK) != 0) {
		return (EXTERROR(EINVAL, "offset not page-aligned", foff));
	}

	if ((flags & MAP_FIXED) == 0) {
		fitit = true;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr)) {
			return (EXTERROR(EINVAL,
			    "fixed mapping address not aligned", *addr));
		}
		fitit = false;
	}

	if (flags & MAP_ANON) {
		if (object != NULL) {
			return (EXTERROR(EINVAL,
			    "anon mapping backed by an object"));
		}
		if (foff != 0) {
			return (EXTERROR(EINVAL,
			    "anon mapping with non-zero offset"));
		}
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL) {
			return (EXTERROR(EINVAL,
			    "stack mapping backed by an object"));
		}
		docow |= MAP_STACK_AREA;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
		if (curmap) {
			default_addr =
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA));
			if ((flags & MAP_32BIT) != 0)
				default_addr = 0;
			rv = vm_map_find_min(map, object, foff, addr, size,
			    default_addr, max_addr, findspace, prot, maxprot,
			    docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{
	int error;

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		error = ENOMEM;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	if ((curthread->td_pflags2 & (TDP2_UEXTERR | TDP2_EXTERR)) ==
	    TDP2_UEXTERR)
		EXTERROR(error, "mach error", rv);
	return (error);
}