1 /*- 2 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) 3 * 4 * Copyright (c) 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * The Mach Operating System project at Carnegie-Mellon University. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 35 * 36 * 37 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 38 * All rights reserved. 39 * 40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 41 * 42 * Permission to use, copy, modify and distribute this software and 43 * its documentation is hereby granted, provided that both the copyright 44 * notice and this permission notice appear in all copies of the 45 * software, derivative works or modified versions, and any portions 46 * thereof, and that both notices appear in supporting documentation. 47 * 48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 49 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 51 * 52 * Carnegie Mellon requests users of this software to return to 53 * 54 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 55 * School of Computer Science 56 * Carnegie Mellon University 57 * Pittsburgh PA 15213-3890 58 * 59 * any improvements or extensions that they make and grant Carnegie the 60 * rights to redistribute these changes. 61 */ 62 63 /* 64 * Virtual memory mapping module. 
65 */ 66 67 #include <sys/cdefs.h> 68 __FBSDID("$FreeBSD$"); 69 70 #include <sys/param.h> 71 #include <sys/systm.h> 72 #include <sys/elf.h> 73 #include <sys/kernel.h> 74 #include <sys/ktr.h> 75 #include <sys/lock.h> 76 #include <sys/mutex.h> 77 #include <sys/proc.h> 78 #include <sys/vmmeter.h> 79 #include <sys/mman.h> 80 #include <sys/vnode.h> 81 #include <sys/racct.h> 82 #include <sys/resourcevar.h> 83 #include <sys/rwlock.h> 84 #include <sys/file.h> 85 #include <sys/sysctl.h> 86 #include <sys/sysent.h> 87 #include <sys/shm.h> 88 89 #include <vm/vm.h> 90 #include <vm/vm_param.h> 91 #include <vm/pmap.h> 92 #include <vm/vm_map.h> 93 #include <vm/vm_page.h> 94 #include <vm/vm_pageout.h> 95 #include <vm/vm_object.h> 96 #include <vm/vm_pager.h> 97 #include <vm/vm_kern.h> 98 #include <vm/vm_extern.h> 99 #include <vm/vnode_pager.h> 100 #include <vm/swap_pager.h> 101 #include <vm/uma.h> 102 103 /* 104 * Virtual memory maps provide for the mapping, protection, 105 * and sharing of virtual memory objects. In addition, 106 * this module provides for an efficient virtual copy of 107 * memory from one map to another. 108 * 109 * Synchronization is required prior to most operations. 110 * 111 * Maps consist of an ordered doubly-linked list of simple 112 * entries; a self-adjusting binary search tree of these 113 * entries is used to speed up lookups. 114 * 115 * Since portions of maps are specified by start/end addresses, 116 * which may not align with existing map entries, all 117 * routines merely "clip" entries to these start/end values. 118 * [That is, an entry is split into two, bordering at a 119 * start or end value.] Note that these clippings may not 120 * always be necessary (as the two resulting entries are then 121 * not changed); however, the clipping is done for convenience. 122 * 123 * As mentioned above, virtual copy operations are performed 124 * by copying VM object references from one map to 125 * another, and then marking both regions as copy-on-write. 126 */ 127 128 static struct mtx map_sleep_mtx; 129 static uma_zone_t mapentzone; 130 static uma_zone_t kmapentzone; 131 static uma_zone_t vmspace_zone; 132 static int vmspace_zinit(void *mem, int size, int flags); 133 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, 134 vm_offset_t max); 135 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); 136 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); 137 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); 138 static int vm_map_growstack(vm_map_t map, vm_offset_t addr, 139 vm_map_entry_t gap_entry); 140 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, 141 vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags); 142 #ifdef INVARIANTS 143 static void vmspace_zdtor(void *mem, int size, void *arg); 144 #endif 145 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, 146 vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, 147 int cow); 148 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, 149 vm_offset_t failed_addr); 150 151 #define ENTRY_CHARGED(e) ((e)->cred != NULL || \ 152 ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \ 153 !((e)->eflags & MAP_ENTRY_NEEDS_COPY))) 154 155 /* 156 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type 157 * stable. 
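 * (Type stability comes from creating vmspace_zone with UMA_ZONE_NOFREE in
 * vm_map_startup() below: a vmspace is never returned to the system, so a
 * lockless reader such as vmspace_acquire_ref() always dereferences valid
 * storage.)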
158 */ 159 #define PROC_VMSPACE_LOCK(p) do { } while (0) 160 #define PROC_VMSPACE_UNLOCK(p) do { } while (0) 161 162 /* 163 * VM_MAP_RANGE_CHECK: [ internal use only ] 164 * 165 * Asserts that the starting and ending region 166 * addresses fall within the valid range of the map. 167 */ 168 #define VM_MAP_RANGE_CHECK(map, start, end) \ 169 { \ 170 if (start < vm_map_min(map)) \ 171 start = vm_map_min(map); \ 172 if (end > vm_map_max(map)) \ 173 end = vm_map_max(map); \ 174 if (start > end) \ 175 start = end; \ 176 } 177 178 /* 179 * vm_map_startup: 180 * 181 * Initialize the vm_map module. Must be called before 182 * any other vm_map routines. 183 * 184 * Map and entry structures are allocated from the general 185 * purpose memory pool with some exceptions: 186 * 187 * - The kernel map and kmem submap are allocated statically. 188 * - Kernel map entries are allocated out of a static pool. 189 * 190 * These restrictions are necessary since malloc() uses the 191 * maps and requires map entries. 192 */ 193 194 void 195 vm_map_startup(void) 196 { 197 mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); 198 kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), 199 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 200 UMA_ZONE_MTXCLASS | UMA_ZONE_VM); 201 mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), 202 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 203 vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, 204 #ifdef INVARIANTS 205 vmspace_zdtor, 206 #else 207 NULL, 208 #endif 209 vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 210 } 211 212 static int 213 vmspace_zinit(void *mem, int size, int flags) 214 { 215 struct vmspace *vm; 216 vm_map_t map; 217 218 vm = (struct vmspace *)mem; 219 map = &vm->vm_map; 220 221 memset(map, 0, sizeof(*map)); 222 mtx_init(&map->system_mtx, "vm map (system)", NULL, 223 MTX_DEF | MTX_DUPOK); 224 sx_init(&map->lock, "vm map (user)"); 225 PMAP_LOCK_INIT(vmspace_pmap(vm)); 226 return (0); 227 } 228 229 #ifdef INVARIANTS 230 static void 231 vmspace_zdtor(void *mem, int size, void *arg) 232 { 233 struct vmspace *vm; 234 235 vm = (struct vmspace *)mem; 236 KASSERT(vm->vm_map.nentries == 0, 237 ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries)); 238 KASSERT(vm->vm_map.size == 0, 239 ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size)); 240 } 241 #endif /* INVARIANTS */ 242 243 /* 244 * Allocate a vmspace structure, including a vm_map and pmap, 245 * and initialize those structures. The refcnt is set to 1. 
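 *
 * A minimal caller sketch (illustrative only; the address bounds are
 * assumptions, not taken from this file).  A NULL return means the pinit
 * callback failed:
 *
 *	struct vmspace *vm;
 *
 *	vm = vmspace_alloc(sv_minuser, sv_maxuser, pmap_pinit);
 *	if (vm == NULL)
 *		return (ENOMEM);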
246 */ 247 struct vmspace * 248 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit) 249 { 250 struct vmspace *vm; 251 252 vm = uma_zalloc(vmspace_zone, M_WAITOK); 253 KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL")); 254 if (!pinit(vmspace_pmap(vm))) { 255 uma_zfree(vmspace_zone, vm); 256 return (NULL); 257 } 258 CTR1(KTR_VM, "vmspace_alloc: %p", vm); 259 _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max); 260 refcount_init(&vm->vm_refcnt, 1); 261 vm->vm_shm = NULL; 262 vm->vm_swrss = 0; 263 vm->vm_tsize = 0; 264 vm->vm_dsize = 0; 265 vm->vm_ssize = 0; 266 vm->vm_taddr = 0; 267 vm->vm_daddr = 0; 268 vm->vm_maxsaddr = 0; 269 return (vm); 270 } 271 272 #ifdef RACCT 273 static void 274 vmspace_container_reset(struct proc *p) 275 { 276 277 PROC_LOCK(p); 278 racct_set(p, RACCT_DATA, 0); 279 racct_set(p, RACCT_STACK, 0); 280 racct_set(p, RACCT_RSS, 0); 281 racct_set(p, RACCT_MEMLOCK, 0); 282 racct_set(p, RACCT_VMEM, 0); 283 PROC_UNLOCK(p); 284 } 285 #endif 286 287 static inline void 288 vmspace_dofree(struct vmspace *vm) 289 { 290 291 CTR1(KTR_VM, "vmspace_free: %p", vm); 292 293 /* 294 * Make sure any SysV shm is freed, it might not have been in 295 * exit1(). 296 */ 297 shmexit(vm); 298 299 /* 300 * Lock the map, to wait out all other references to it. 301 * Delete all of the mappings and pages they hold, then call 302 * the pmap module to reclaim anything left. 303 */ 304 (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), 305 vm_map_max(&vm->vm_map)); 306 307 pmap_release(vmspace_pmap(vm)); 308 vm->vm_map.pmap = NULL; 309 uma_zfree(vmspace_zone, vm); 310 } 311 312 void 313 vmspace_free(struct vmspace *vm) 314 { 315 316 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 317 "vmspace_free() called"); 318 319 if (refcount_release(&vm->vm_refcnt)) 320 vmspace_dofree(vm); 321 } 322 323 void 324 vmspace_exitfree(struct proc *p) 325 { 326 struct vmspace *vm; 327 328 PROC_VMSPACE_LOCK(p); 329 vm = p->p_vmspace; 330 p->p_vmspace = NULL; 331 PROC_VMSPACE_UNLOCK(p); 332 KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); 333 vmspace_free(vm); 334 } 335 336 void 337 vmspace_exit(struct thread *td) 338 { 339 struct vmspace *vm; 340 struct proc *p; 341 bool released; 342 343 p = td->td_proc; 344 vm = p->p_vmspace; 345 346 /* 347 * Prepare to release the vmspace reference. The thread that releases 348 * the last reference is responsible for tearing down the vmspace. 349 * However, threads not releasing the final reference must switch to the 350 * kernel's vmspace0 before the decrement so that the subsequent pmap 351 * deactivation does not modify a freed vmspace. 352 */ 353 refcount_acquire(&vmspace0.vm_refcnt); 354 if (!(released = refcount_release_if_last(&vm->vm_refcnt))) { 355 if (p->p_vmspace != &vmspace0) { 356 PROC_VMSPACE_LOCK(p); 357 p->p_vmspace = &vmspace0; 358 PROC_VMSPACE_UNLOCK(p); 359 pmap_activate(td); 360 } 361 released = refcount_release(&vm->vm_refcnt); 362 } 363 if (released) { 364 /* 365 * pmap_remove_pages() expects the pmap to be active, so switch 366 * back first if necessary. 
367 */ 368 if (p->p_vmspace != vm) { 369 PROC_VMSPACE_LOCK(p); 370 p->p_vmspace = vm; 371 PROC_VMSPACE_UNLOCK(p); 372 pmap_activate(td); 373 } 374 pmap_remove_pages(vmspace_pmap(vm)); 375 PROC_VMSPACE_LOCK(p); 376 p->p_vmspace = &vmspace0; 377 PROC_VMSPACE_UNLOCK(p); 378 pmap_activate(td); 379 vmspace_dofree(vm); 380 } 381 #ifdef RACCT 382 if (racct_enable) 383 vmspace_container_reset(p); 384 #endif 385 } 386 387 /* Acquire reference to vmspace owned by another process. */ 388 389 struct vmspace * 390 vmspace_acquire_ref(struct proc *p) 391 { 392 struct vmspace *vm; 393 394 PROC_VMSPACE_LOCK(p); 395 vm = p->p_vmspace; 396 if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) { 397 PROC_VMSPACE_UNLOCK(p); 398 return (NULL); 399 } 400 if (vm != p->p_vmspace) { 401 PROC_VMSPACE_UNLOCK(p); 402 vmspace_free(vm); 403 return (NULL); 404 } 405 PROC_VMSPACE_UNLOCK(p); 406 return (vm); 407 } 408 409 /* 410 * Switch between vmspaces in an AIO kernel process. 411 * 412 * The new vmspace is either the vmspace of a user process obtained 413 * from an active AIO request or the initial vmspace of the AIO kernel 414 * process (when it is idling). Because user processes will block to 415 * drain any active AIO requests before proceeding in exit() or 416 * execve(), the reference count for vmspaces from AIO requests can 417 * never be 0. Similarly, AIO kernel processes hold an extra 418 * reference on their initial vmspace for the life of the process. As 419 * a result, the 'newvm' vmspace always has a non-zero reference 420 * count. This permits an additional reference on 'newvm' to be 421 * acquired via a simple atomic increment rather than the loop in 422 * vmspace_acquire_ref() above. 423 */ 424 void 425 vmspace_switch_aio(struct vmspace *newvm) 426 { 427 struct vmspace *oldvm; 428 429 /* XXX: Need some way to assert that this is an aio daemon. */ 430 431 KASSERT(refcount_load(&newvm->vm_refcnt) > 0, 432 ("vmspace_switch_aio: newvm unreferenced")); 433 434 oldvm = curproc->p_vmspace; 435 if (oldvm == newvm) 436 return; 437 438 /* 439 * Point to the new address space and refer to it. 440 */ 441 curproc->p_vmspace = newvm; 442 refcount_acquire(&newvm->vm_refcnt); 443 444 /* Activate the new mapping. */ 445 pmap_activate(curthread); 446 447 vmspace_free(oldvm); 448 } 449 450 void 451 _vm_map_lock(vm_map_t map, const char *file, int line) 452 { 453 454 if (map->system_map) 455 mtx_lock_flags_(&map->system_mtx, 0, file, line); 456 else 457 sx_xlock_(&map->lock, file, line); 458 map->timestamp++; 459 } 460 461 void 462 vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add) 463 { 464 vm_object_t object; 465 struct vnode *vp; 466 bool vp_held; 467 468 if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0) 469 return; 470 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, 471 ("Submap with execs")); 472 object = entry->object.vm_object; 473 KASSERT(object != NULL, ("No object for text, entry %p", entry)); 474 if ((object->flags & OBJ_ANON) != 0) 475 object = object->handle; 476 else 477 KASSERT(object->backing_object == NULL, 478 ("non-anon object %p shadows", object)); 479 KASSERT(object != NULL, ("No content object for text, entry %p obj %p", 480 entry, entry->object.vm_object)); 481 482 /* 483 * Mostly, we do not lock the backing object. It is 484 * referenced by the entry we are processing, so it cannot go 485 * away. 486 */ 487 vp = NULL; 488 vp_held = false; 489 if (object->type == OBJT_DEAD) { 490 /* 491 * For OBJT_DEAD objects, v_writecount was handled in 492 * vnode_pager_dealloc(). 
	 */
	} else if (object->type == OBJT_VNODE) {
		vp = object->handle;
	} else if (object->type == OBJT_SWAP) {
		KASSERT((object->flags & OBJ_TMPFS_NODE) != 0,
		    ("vm_map_entry_set_vnode_text: swap and !TMPFS "
		    "entry %p, object %p, add %d", entry, object, add));
		/*
		 * Tmpfs VREG node, which was reclaimed, has
		 * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS.  In
		 * this case there is no v_writecount to adjust.
		 */
		VM_OBJECT_RLOCK(object);
		if ((object->flags & OBJ_TMPFS) != 0) {
			vp = object->un_pager.swp.swp_tmpfs;
			if (vp != NULL) {
				vhold(vp);
				vp_held = true;
			}
		}
		VM_OBJECT_RUNLOCK(object);
	} else {
		KASSERT(0,
		    ("vm_map_entry_set_vnode_text: wrong object type, "
		    "entry %p, object %p, add %d", entry, object, add));
	}
	if (vp != NULL) {
		if (add) {
			VOP_SET_TEXT_CHECKED(vp);
		} else {
			vn_lock(vp, LK_SHARED | LK_RETRY);
			VOP_UNSET_TEXT_CHECKED(vp);
			VOP_UNLOCK(vp);
		}
		if (vp_held)
			vdrop(vp);
	}
}

/*
 * Use a different name for this vm_map_entry field when its use
 * is not consistent with its use as part of an ordered search tree.
 */
#define defer_next	right

static void
vm_map_process_deferred(void)
{
	struct thread *td;
	vm_map_entry_t entry, next;
	vm_object_t object;

	td = curthread;
	entry = td->td_map_def_user;
	td->td_map_def_user = NULL;
	while (entry != NULL) {
		next = entry->defer_next;
		MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
		    MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
		    MAP_ENTRY_VN_EXEC));
		if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
			/*
			 * Decrement the object's writemappings and
			 * possibly the vnode's v_writecount.
			 */
			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
			    ("Submap with writecount"));
			object = entry->object.vm_object;
			KASSERT(object != NULL, ("No object for writecount"));
			vm_pager_release_writecount(object, entry->start,
			    entry->end);
		}
		vm_map_entry_set_vnode_text(entry, false);
		vm_map_entry_deallocate(entry, FALSE);
		entry = next;
	}
}

#ifdef INVARIANTS
static void
_vm_map_assert_locked(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	else
		sx_assert_(&map->lock, SA_XLOCKED, file, line);
}

#define	VM_MAP_ASSERT_LOCKED(map) \
    _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)

enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
#ifdef DIAGNOSTIC
static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
#else
static int enable_vmmap_check = VMMAP_CHECK_NONE;
#endif
SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
    &enable_vmmap_check, 0, "Enable vm map consistency checking");

static void _vm_map_assert_consistent(vm_map_t map, int check);

#define VM_MAP_ASSERT_CONSISTENT(map) \
    _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
#ifdef DIAGNOSTIC
#define VM_MAP_UNLOCK_CONSISTENT(map) do {				\
	if (map->nupdates > map->nentries) {				\
		_vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK);	\
		map->nupdates = 0;					\
	}								\
} while (0)
#else
#define VM_MAP_UNLOCK_CONSISTENT(map)
#endif
#else
#define	VM_MAP_ASSERT_LOCKED(map)
#define VM_MAP_ASSERT_CONSISTENT(map)
#define VM_MAP_UNLOCK_CONSISTENT(map)
#endif /* INVARIANTS */

void
_vm_map_unlock(vm_map_t map, const
char *file, int line) 616 { 617 618 VM_MAP_UNLOCK_CONSISTENT(map); 619 if (map->system_map) 620 mtx_unlock_flags_(&map->system_mtx, 0, file, line); 621 else { 622 sx_xunlock_(&map->lock, file, line); 623 vm_map_process_deferred(); 624 } 625 } 626 627 void 628 _vm_map_lock_read(vm_map_t map, const char *file, int line) 629 { 630 631 if (map->system_map) 632 mtx_lock_flags_(&map->system_mtx, 0, file, line); 633 else 634 sx_slock_(&map->lock, file, line); 635 } 636 637 void 638 _vm_map_unlock_read(vm_map_t map, const char *file, int line) 639 { 640 641 if (map->system_map) 642 mtx_unlock_flags_(&map->system_mtx, 0, file, line); 643 else { 644 sx_sunlock_(&map->lock, file, line); 645 vm_map_process_deferred(); 646 } 647 } 648 649 int 650 _vm_map_trylock(vm_map_t map, const char *file, int line) 651 { 652 int error; 653 654 error = map->system_map ? 655 !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : 656 !sx_try_xlock_(&map->lock, file, line); 657 if (error == 0) 658 map->timestamp++; 659 return (error == 0); 660 } 661 662 int 663 _vm_map_trylock_read(vm_map_t map, const char *file, int line) 664 { 665 int error; 666 667 error = map->system_map ? 668 !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : 669 !sx_try_slock_(&map->lock, file, line); 670 return (error == 0); 671 } 672 673 /* 674 * _vm_map_lock_upgrade: [ internal use only ] 675 * 676 * Tries to upgrade a read (shared) lock on the specified map to a write 677 * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a 678 * non-zero value if the upgrade fails. If the upgrade fails, the map is 679 * returned without a read or write lock held. 680 * 681 * Requires that the map be read locked. 682 */ 683 int 684 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) 685 { 686 unsigned int last_timestamp; 687 688 if (map->system_map) { 689 mtx_assert_(&map->system_mtx, MA_OWNED, file, line); 690 } else { 691 if (!sx_try_upgrade_(&map->lock, file, line)) { 692 last_timestamp = map->timestamp; 693 sx_sunlock_(&map->lock, file, line); 694 vm_map_process_deferred(); 695 /* 696 * If the map's timestamp does not change while the 697 * map is unlocked, then the upgrade succeeds. 698 */ 699 sx_xlock_(&map->lock, file, line); 700 if (last_timestamp != map->timestamp) { 701 sx_xunlock_(&map->lock, file, line); 702 return (1); 703 } 704 } 705 } 706 map->timestamp++; 707 return (0); 708 } 709 710 void 711 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) 712 { 713 714 if (map->system_map) { 715 mtx_assert_(&map->system_mtx, MA_OWNED, file, line); 716 } else { 717 VM_MAP_UNLOCK_CONSISTENT(map); 718 sx_downgrade_(&map->lock, file, line); 719 } 720 } 721 722 /* 723 * vm_map_locked: 724 * 725 * Returns a non-zero value if the caller holds a write (exclusive) lock 726 * on the specified map and the value "0" otherwise. 727 */ 728 int 729 vm_map_locked(vm_map_t map) 730 { 731 732 if (map->system_map) 733 return (mtx_owned(&map->system_mtx)); 734 else 735 return (sx_xlocked(&map->lock)); 736 } 737 738 /* 739 * _vm_map_unlock_and_wait: 740 * 741 * Atomically releases the lock on the specified map and puts the calling 742 * thread to sleep. The calling thread will remain asleep until either 743 * vm_map_wakeup() is performed on the map or the specified timeout is 744 * exceeded. 745 * 746 * WARNING! This function does not perform deferred deallocations of 747 * objects and map entries. 
Therefore, the calling thread is expected to 748 * reacquire the map lock after reawakening and later perform an ordinary 749 * unlock operation, such as vm_map_unlock(), before completing its 750 * operation on the map. 751 */ 752 int 753 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line) 754 { 755 756 VM_MAP_UNLOCK_CONSISTENT(map); 757 mtx_lock(&map_sleep_mtx); 758 if (map->system_map) 759 mtx_unlock_flags_(&map->system_mtx, 0, file, line); 760 else 761 sx_xunlock_(&map->lock, file, line); 762 return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", 763 timo)); 764 } 765 766 /* 767 * vm_map_wakeup: 768 * 769 * Awaken any threads that have slept on the map using 770 * vm_map_unlock_and_wait(). 771 */ 772 void 773 vm_map_wakeup(vm_map_t map) 774 { 775 776 /* 777 * Acquire and release map_sleep_mtx to prevent a wakeup() 778 * from being performed (and lost) between the map unlock 779 * and the msleep() in _vm_map_unlock_and_wait(). 780 */ 781 mtx_lock(&map_sleep_mtx); 782 mtx_unlock(&map_sleep_mtx); 783 wakeup(&map->root); 784 } 785 786 void 787 vm_map_busy(vm_map_t map) 788 { 789 790 VM_MAP_ASSERT_LOCKED(map); 791 map->busy++; 792 } 793 794 void 795 vm_map_unbusy(vm_map_t map) 796 { 797 798 VM_MAP_ASSERT_LOCKED(map); 799 KASSERT(map->busy, ("vm_map_unbusy: not busy")); 800 if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) { 801 vm_map_modflags(map, 0, MAP_BUSY_WAKEUP); 802 wakeup(&map->busy); 803 } 804 } 805 806 void 807 vm_map_wait_busy(vm_map_t map) 808 { 809 810 VM_MAP_ASSERT_LOCKED(map); 811 while (map->busy) { 812 vm_map_modflags(map, MAP_BUSY_WAKEUP, 0); 813 if (map->system_map) 814 msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0); 815 else 816 sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0); 817 } 818 map->timestamp++; 819 } 820 821 long 822 vmspace_resident_count(struct vmspace *vmspace) 823 { 824 return pmap_resident_count(vmspace_pmap(vmspace)); 825 } 826 827 /* 828 * Initialize an existing vm_map structure 829 * such as that in the vmspace structure. 830 */ 831 static void 832 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) 833 { 834 835 map->header.eflags = MAP_ENTRY_HEADER; 836 map->needs_wakeup = FALSE; 837 map->system_map = 0; 838 map->pmap = pmap; 839 map->header.end = min; 840 map->header.start = max; 841 map->flags = 0; 842 map->header.left = map->header.right = &map->header; 843 map->root = NULL; 844 map->timestamp = 0; 845 map->busy = 0; 846 map->anon_loc = 0; 847 #ifdef DIAGNOSTIC 848 map->nupdates = 0; 849 #endif 850 } 851 852 void 853 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) 854 { 855 856 _vm_map_init(map, pmap, min, max); 857 mtx_init(&map->system_mtx, "vm map (system)", NULL, 858 MTX_DEF | MTX_DUPOK); 859 sx_init(&map->lock, "vm map (user)"); 860 } 861 862 /* 863 * vm_map_entry_dispose: [ internal use only ] 864 * 865 * Inverse of vm_map_entry_create. 866 */ 867 static void 868 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) 869 { 870 uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); 871 } 872 873 /* 874 * vm_map_entry_create: [ internal use only ] 875 * 876 * Allocates a VM map entry for insertion. 877 * No entry fields are filled in. 
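 *
 * (System maps allocate from the static kmapentzone with M_NOWAIT and
 * panic if that fails; user maps allocate from mapentzone with M_WAITOK,
 * which sleeps rather than fail.)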
878 */ 879 static vm_map_entry_t 880 vm_map_entry_create(vm_map_t map) 881 { 882 vm_map_entry_t new_entry; 883 884 if (map->system_map) 885 new_entry = uma_zalloc(kmapentzone, M_NOWAIT); 886 else 887 new_entry = uma_zalloc(mapentzone, M_WAITOK); 888 if (new_entry == NULL) 889 panic("vm_map_entry_create: kernel resources exhausted"); 890 return (new_entry); 891 } 892 893 /* 894 * vm_map_entry_set_behavior: 895 * 896 * Set the expected access behavior, either normal, random, or 897 * sequential. 898 */ 899 static inline void 900 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) 901 { 902 entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | 903 (behavior & MAP_ENTRY_BEHAV_MASK); 904 } 905 906 /* 907 * vm_map_entry_max_free_{left,right}: 908 * 909 * Compute the size of the largest free gap between two entries, 910 * one the root of a tree and the other the ancestor of that root 911 * that is the least or greatest ancestor found on the search path. 912 */ 913 static inline vm_size_t 914 vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor) 915 { 916 917 return (root->left != left_ancestor ? 918 root->left->max_free : root->start - left_ancestor->end); 919 } 920 921 static inline vm_size_t 922 vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor) 923 { 924 925 return (root->right != right_ancestor ? 926 root->right->max_free : right_ancestor->start - root->end); 927 } 928 929 /* 930 * vm_map_entry_{pred,succ}: 931 * 932 * Find the {predecessor, successor} of the entry by taking one step 933 * in the appropriate direction and backtracking as much as necessary. 934 * vm_map_entry_succ is defined in vm_map.h. 935 */ 936 static inline vm_map_entry_t 937 vm_map_entry_pred(vm_map_entry_t entry) 938 { 939 vm_map_entry_t prior; 940 941 prior = entry->left; 942 if (prior->right->start < entry->start) { 943 do 944 prior = prior->right; 945 while (prior->right != entry); 946 } 947 return (prior); 948 } 949 950 static inline vm_size_t 951 vm_size_max(vm_size_t a, vm_size_t b) 952 { 953 954 return (a > b ? a : b); 955 } 956 957 #define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \ 958 vm_map_entry_t z; \ 959 vm_size_t max_free; \ 960 \ 961 /* \ 962 * Infer root->right->max_free == root->max_free when \ 963 * y->max_free < root->max_free || root->max_free == 0. \ 964 * Otherwise, look right to find it. \ 965 */ \ 966 y = root->left; \ 967 max_free = root->max_free; \ 968 KASSERT(max_free == vm_size_max( \ 969 vm_map_entry_max_free_left(root, llist), \ 970 vm_map_entry_max_free_right(root, rlist)), \ 971 ("%s: max_free invariant fails", __func__)); \ 972 if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \ 973 max_free = vm_map_entry_max_free_right(root, rlist); \ 974 if (y != llist && (test)) { \ 975 /* Rotate right and make y root. */ \ 976 z = y->right; \ 977 if (z != root) { \ 978 root->left = z; \ 979 y->right = root; \ 980 if (max_free < y->max_free) \ 981 root->max_free = max_free = \ 982 vm_size_max(max_free, z->max_free); \ 983 } else if (max_free < y->max_free) \ 984 root->max_free = max_free = \ 985 vm_size_max(max_free, root->start - y->end);\ 986 root = y; \ 987 y = root->left; \ 988 } \ 989 /* Copy right->max_free. Put root on rlist. */ \ 990 root->max_free = max_free; \ 991 KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \ 992 ("%s: max_free not copied from right", __func__)); \ 993 root->left = rlist; \ 994 rlist = root; \ 995 root = y != llist ? 
y : NULL; \ 996 } while (0) 997 998 #define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \ 999 vm_map_entry_t z; \ 1000 vm_size_t max_free; \ 1001 \ 1002 /* \ 1003 * Infer root->left->max_free == root->max_free when \ 1004 * y->max_free < root->max_free || root->max_free == 0. \ 1005 * Otherwise, look left to find it. \ 1006 */ \ 1007 y = root->right; \ 1008 max_free = root->max_free; \ 1009 KASSERT(max_free == vm_size_max( \ 1010 vm_map_entry_max_free_left(root, llist), \ 1011 vm_map_entry_max_free_right(root, rlist)), \ 1012 ("%s: max_free invariant fails", __func__)); \ 1013 if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \ 1014 max_free = vm_map_entry_max_free_left(root, llist); \ 1015 if (y != rlist && (test)) { \ 1016 /* Rotate left and make y root. */ \ 1017 z = y->left; \ 1018 if (z != root) { \ 1019 root->right = z; \ 1020 y->left = root; \ 1021 if (max_free < y->max_free) \ 1022 root->max_free = max_free = \ 1023 vm_size_max(max_free, z->max_free); \ 1024 } else if (max_free < y->max_free) \ 1025 root->max_free = max_free = \ 1026 vm_size_max(max_free, y->start - root->end);\ 1027 root = y; \ 1028 y = root->right; \ 1029 } \ 1030 /* Copy left->max_free. Put root on llist. */ \ 1031 root->max_free = max_free; \ 1032 KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \ 1033 ("%s: max_free not copied from left", __func__)); \ 1034 root->right = llist; \ 1035 llist = root; \ 1036 root = y != rlist ? y : NULL; \ 1037 } while (0) 1038 1039 /* 1040 * Walk down the tree until we find addr or a gap where addr would go, breaking 1041 * off left and right subtrees of nodes less than, or greater than addr. Treat 1042 * subtrees with root->max_free < length as empty trees. llist and rlist are 1043 * the two sides in reverse order (bottom-up), with llist linked by the right 1044 * pointer and rlist linked by the left pointer in the vm_map_entry, and both 1045 * lists terminated by &map->header. This function, and the subsequent call to 1046 * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address 1047 * values in &map->header. 1048 */ 1049 static __always_inline vm_map_entry_t 1050 vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length, 1051 vm_map_entry_t *llist, vm_map_entry_t *rlist) 1052 { 1053 vm_map_entry_t left, right, root, y; 1054 1055 left = right = &map->header; 1056 root = map->root; 1057 while (root != NULL && root->max_free >= length) { 1058 KASSERT(left->end <= root->start && 1059 root->end <= right->start, 1060 ("%s: root not within tree bounds", __func__)); 1061 if (addr < root->start) { 1062 SPLAY_LEFT_STEP(root, y, left, right, 1063 y->max_free >= length && addr < y->start); 1064 } else if (addr >= root->end) { 1065 SPLAY_RIGHT_STEP(root, y, left, right, 1066 y->max_free >= length && addr >= y->end); 1067 } else 1068 break; 1069 } 1070 *llist = left; 1071 *rlist = right; 1072 return (root); 1073 } 1074 1075 static __always_inline void 1076 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist) 1077 { 1078 vm_map_entry_t hi, right, y; 1079 1080 right = *rlist; 1081 hi = root->right == right ? NULL : root->right; 1082 if (hi == NULL) 1083 return; 1084 do 1085 SPLAY_LEFT_STEP(hi, y, root, right, true); 1086 while (hi != NULL); 1087 *rlist = right; 1088 } 1089 1090 static __always_inline void 1091 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist) 1092 { 1093 vm_map_entry_t left, lo, y; 1094 1095 left = *llist; 1096 lo = root->left == left ? 
	    NULL : root->left;
	if (lo == NULL)
		return;
	do
		SPLAY_RIGHT_STEP(lo, y, left, root, true);
	while (lo != NULL);
	*llist = left;
}

static inline void
vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
{
	vm_map_entry_t tmp;

	tmp = *b;
	*b = *a;
	*a = tmp;
}

/*
 * Walk back up the two spines, flip the pointers and set max_free.  The
 * subtrees of the root go at the bottom of llist and rlist.
 */
static vm_size_t
vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
{
	do {
		/*
		 * The max_free values of the children of llist are in
		 * llist->max_free and max_free.  Update with the
		 * max value.
		 */
		llist->max_free = max_free =
		    vm_size_max(llist->max_free, max_free);
		vm_map_entry_swap(&llist->right, &tail);
		vm_map_entry_swap(&tail, &llist);
	} while (llist != header);
	root->left = tail;
	return (max_free);
}

/*
 * When llist is known to be the predecessor of root.
 */
static inline vm_size_t
vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t llist)
{
	vm_size_t max_free;

	max_free = root->start - llist->end;
	if (llist != header) {
		max_free = vm_map_splay_merge_left_walk(header, root,
		    root, max_free, llist);
	} else {
		root->left = header;
		header->right = root;
	}
	return (max_free);
}

/*
 * When llist may or may not be the predecessor of root.
 */
static inline vm_size_t
vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t llist)
{
	vm_size_t max_free;

	max_free = vm_map_entry_max_free_left(root, llist);
	if (llist != header) {
		max_free = vm_map_splay_merge_left_walk(header, root,
		    root->left == llist ? root : root->left,
		    max_free, llist);
	}
	return (max_free);
}

static vm_size_t
vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
{
	do {
		/*
		 * The max_free values of the children of rlist are in
		 * rlist->max_free and max_free.  Update with the
		 * max value.
		 */
		rlist->max_free = max_free =
		    vm_size_max(rlist->max_free, max_free);
		vm_map_entry_swap(&rlist->left, &tail);
		vm_map_entry_swap(&tail, &rlist);
	} while (rlist != header);
	root->right = tail;
	return (max_free);
}

/*
 * When rlist is known to be the successor of root.
 */
static inline vm_size_t
vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t rlist)
{
	vm_size_t max_free;

	max_free = rlist->start - root->end;
	if (rlist != header) {
		max_free = vm_map_splay_merge_right_walk(header, root,
		    root, max_free, rlist);
	} else {
		root->right = header;
		header->left = root;
	}
	return (max_free);
}

/*
 * When rlist may or may not be the successor of root.
 */
static inline vm_size_t
vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t rlist)
{
	vm_size_t max_free;

	max_free = vm_map_entry_max_free_right(root, rlist);
	if (rlist != header) {
		max_free = vm_map_splay_merge_right_walk(header, root,
		    root->right == rlist ? root : root->right,
		    max_free, rlist);
	}
	return (max_free);
}

/*
 * vm_map_splay:
 *
 * The Sleator and Tarjan top-down splay algorithm with the
 * following variation.  Max_free must be computed bottom-up, so
 * on the downward pass, maintain the left and right spines in
 * reverse order.  Then, make a second pass up each side to fix
 * the pointers and compute max_free.  The time bound is O(log n)
 * amortized.
 *
 * The tree is threaded, which means that there are no null pointers.
 * When a node has no left child, its left pointer points to its
 * predecessor, which is the last ancestor on the search path from the
 * root where the search branched right.  Likewise, when a node has no
 * right child, its right pointer points to its successor.  The map
 * header node is the predecessor of the first map entry, and the
 * successor of the last.
 *
 * The new root is the vm_map_entry containing "addr", or else an
 * adjacent entry (lower if possible) if addr is not in the tree.
 *
 * The map must be locked, and leaves it so.
 *
 * Returns: the new root.
 */
static vm_map_entry_t
vm_map_splay(vm_map_t map, vm_offset_t addr)
{
	vm_map_entry_t header, llist, rlist, root;
	vm_size_t max_free_left, max_free_right;

	header = &map->header;
	root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
	if (root != NULL) {
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else if (llist != header) {
		/*
		 * Recover the greatest node in the left
		 * subtree and make it the root.
		 */
		root = llist;
		llist = root->right;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
	} else if (rlist != header) {
		/*
		 * Recover the least node in the right
		 * subtree and make it the root.
		 */
		root = rlist;
		rlist = root->left;
		max_free_left = vm_map_splay_merge_pred(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else {
		/* There is no root. */
		return (NULL);
	}
	root->max_free = vm_size_max(max_free_left, max_free_right);
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	return (root);
}

/*
 * vm_map_entry_{un,}link:
 *
 * Insert/remove entries from maps.  On linking, if new entry clips
 * existing entry, trim existing entry to avoid overlap, and manage
 * offsets.  On unlinking, merge disappearing entry with neighbor, if
 * called for, and manage offsets.  Callers should not modify fields in
 * entries already mapped.
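 *
 * For example, linking a new entry [A, B) over the front of an existing
 * entry [A, C), with A < B < C, shrinks the existing entry to [B, C) and
 * advances its offset by B - A, so both entries keep mapping the same
 * backing object pages as before the clip.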
1305 */ 1306 static void 1307 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry) 1308 { 1309 vm_map_entry_t header, llist, rlist, root; 1310 vm_size_t max_free_left, max_free_right; 1311 1312 CTR3(KTR_VM, 1313 "vm_map_entry_link: map %p, nentries %d, entry %p", map, 1314 map->nentries, entry); 1315 VM_MAP_ASSERT_LOCKED(map); 1316 map->nentries++; 1317 header = &map->header; 1318 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); 1319 if (root == NULL) { 1320 /* 1321 * The new entry does not overlap any existing entry in the 1322 * map, so it becomes the new root of the map tree. 1323 */ 1324 max_free_left = vm_map_splay_merge_pred(header, entry, llist); 1325 max_free_right = vm_map_splay_merge_succ(header, entry, rlist); 1326 } else if (entry->start == root->start) { 1327 /* 1328 * The new entry is a clone of root, with only the end field 1329 * changed. The root entry will be shrunk to abut the new 1330 * entry, and will be the right child of the new root entry in 1331 * the modified map. 1332 */ 1333 KASSERT(entry->end < root->end, 1334 ("%s: clip_start not within entry", __func__)); 1335 vm_map_splay_findprev(root, &llist); 1336 root->offset += entry->end - root->start; 1337 root->start = entry->end; 1338 max_free_left = vm_map_splay_merge_pred(header, entry, llist); 1339 max_free_right = root->max_free = vm_size_max( 1340 vm_map_splay_merge_pred(entry, root, entry), 1341 vm_map_splay_merge_right(header, root, rlist)); 1342 } else { 1343 /* 1344 * The new entry is a clone of root, with only the start field 1345 * changed. The root entry will be shrunk to abut the new 1346 * entry, and will be the left child of the new root entry in 1347 * the modified map. 1348 */ 1349 KASSERT(entry->end == root->end, 1350 ("%s: clip_start not within entry", __func__)); 1351 vm_map_splay_findnext(root, &rlist); 1352 entry->offset += entry->start - root->start; 1353 root->end = entry->start; 1354 max_free_left = root->max_free = vm_size_max( 1355 vm_map_splay_merge_left(header, root, llist), 1356 vm_map_splay_merge_succ(entry, root, entry)); 1357 max_free_right = vm_map_splay_merge_succ(header, entry, rlist); 1358 } 1359 entry->max_free = vm_size_max(max_free_left, max_free_right); 1360 map->root = entry; 1361 VM_MAP_ASSERT_CONSISTENT(map); 1362 } 1363 1364 enum unlink_merge_type { 1365 UNLINK_MERGE_NONE, 1366 UNLINK_MERGE_NEXT 1367 }; 1368 1369 static void 1370 vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry, 1371 enum unlink_merge_type op) 1372 { 1373 vm_map_entry_t header, llist, rlist, root; 1374 vm_size_t max_free_left, max_free_right; 1375 1376 VM_MAP_ASSERT_LOCKED(map); 1377 header = &map->header; 1378 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); 1379 KASSERT(root != NULL, 1380 ("vm_map_entry_unlink: unlink object not mapped")); 1381 1382 vm_map_splay_findprev(root, &llist); 1383 vm_map_splay_findnext(root, &rlist); 1384 if (op == UNLINK_MERGE_NEXT) { 1385 rlist->start = root->start; 1386 rlist->offset = root->offset; 1387 } 1388 if (llist != header) { 1389 root = llist; 1390 llist = root->right; 1391 max_free_left = vm_map_splay_merge_left(header, root, llist); 1392 max_free_right = vm_map_splay_merge_succ(header, root, rlist); 1393 } else if (rlist != header) { 1394 root = rlist; 1395 rlist = root->left; 1396 max_free_left = vm_map_splay_merge_pred(header, root, llist); 1397 max_free_right = vm_map_splay_merge_right(header, root, rlist); 1398 } else { 1399 header->left = header->right = header; 1400 root = NULL; 1401 } 1402 if (root != NULL) 1403 
root->max_free = vm_size_max(max_free_left, max_free_right); 1404 map->root = root; 1405 VM_MAP_ASSERT_CONSISTENT(map); 1406 map->nentries--; 1407 CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, 1408 map->nentries, entry); 1409 } 1410 1411 /* 1412 * vm_map_entry_resize: 1413 * 1414 * Resize a vm_map_entry, recompute the amount of free space that 1415 * follows it and propagate that value up the tree. 1416 * 1417 * The map must be locked, and leaves it so. 1418 */ 1419 static void 1420 vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount) 1421 { 1422 vm_map_entry_t header, llist, rlist, root; 1423 1424 VM_MAP_ASSERT_LOCKED(map); 1425 header = &map->header; 1426 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); 1427 KASSERT(root != NULL, ("%s: resize object not mapped", __func__)); 1428 vm_map_splay_findnext(root, &rlist); 1429 entry->end += grow_amount; 1430 root->max_free = vm_size_max( 1431 vm_map_splay_merge_left(header, root, llist), 1432 vm_map_splay_merge_succ(header, root, rlist)); 1433 map->root = root; 1434 VM_MAP_ASSERT_CONSISTENT(map); 1435 CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p", 1436 __func__, map, map->nentries, entry); 1437 } 1438 1439 /* 1440 * vm_map_lookup_entry: [ internal use only ] 1441 * 1442 * Finds the map entry containing (or 1443 * immediately preceding) the specified address 1444 * in the given map; the entry is returned 1445 * in the "entry" parameter. The boolean 1446 * result indicates whether the address is 1447 * actually contained in the map. 1448 */ 1449 boolean_t 1450 vm_map_lookup_entry( 1451 vm_map_t map, 1452 vm_offset_t address, 1453 vm_map_entry_t *entry) /* OUT */ 1454 { 1455 vm_map_entry_t cur, header, lbound, ubound; 1456 boolean_t locked; 1457 1458 /* 1459 * If the map is empty, then the map entry immediately preceding 1460 * "address" is the map's header. 1461 */ 1462 header = &map->header; 1463 cur = map->root; 1464 if (cur == NULL) { 1465 *entry = header; 1466 return (FALSE); 1467 } 1468 if (address >= cur->start && cur->end > address) { 1469 *entry = cur; 1470 return (TRUE); 1471 } 1472 if ((locked = vm_map_locked(map)) || 1473 sx_try_upgrade(&map->lock)) { 1474 /* 1475 * Splay requires a write lock on the map. However, it only 1476 * restructures the binary search tree; it does not otherwise 1477 * change the map. Thus, the map's timestamp need not change 1478 * on a temporary upgrade. 1479 */ 1480 cur = vm_map_splay(map, address); 1481 if (!locked) { 1482 VM_MAP_UNLOCK_CONSISTENT(map); 1483 sx_downgrade(&map->lock); 1484 } 1485 1486 /* 1487 * If "address" is contained within a map entry, the new root 1488 * is that map entry. Otherwise, the new root is a map entry 1489 * immediately before or after "address". 1490 */ 1491 if (address < cur->start) { 1492 *entry = header; 1493 return (FALSE); 1494 } 1495 *entry = cur; 1496 return (address < cur->end); 1497 } 1498 /* 1499 * Since the map is only locked for read access, perform a 1500 * standard binary search tree lookup for "address". 
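	 * The tree is threaded (see vm_map_splay() above), so the walk below
	 * terminates when it steps onto lbound or ubound instead of onto a
	 * NULL child pointer.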
1501 */ 1502 lbound = ubound = header; 1503 for (;;) { 1504 if (address < cur->start) { 1505 ubound = cur; 1506 cur = cur->left; 1507 if (cur == lbound) 1508 break; 1509 } else if (cur->end <= address) { 1510 lbound = cur; 1511 cur = cur->right; 1512 if (cur == ubound) 1513 break; 1514 } else { 1515 *entry = cur; 1516 return (TRUE); 1517 } 1518 } 1519 *entry = lbound; 1520 return (FALSE); 1521 } 1522 1523 /* 1524 * vm_map_insert: 1525 * 1526 * Inserts the given whole VM object into the target 1527 * map at the specified address range. The object's 1528 * size should match that of the address range. 1529 * 1530 * Requires that the map be locked, and leaves it so. 1531 * 1532 * If object is non-NULL, ref count must be bumped by caller 1533 * prior to making call to account for the new entry. 1534 */ 1535 int 1536 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 1537 vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) 1538 { 1539 vm_map_entry_t new_entry, next_entry, prev_entry; 1540 struct ucred *cred; 1541 vm_eflags_t protoeflags; 1542 vm_inherit_t inheritance; 1543 u_long bdry; 1544 u_int bidx; 1545 1546 VM_MAP_ASSERT_LOCKED(map); 1547 KASSERT(object != kernel_object || 1548 (cow & MAP_COPY_ON_WRITE) == 0, 1549 ("vm_map_insert: kernel object and COW")); 1550 KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 || 1551 (cow & MAP_SPLIT_BOUNDARY_MASK) != 0, 1552 ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x", 1553 object, cow)); 1554 KASSERT((prot & ~max) == 0, 1555 ("prot %#x is not subset of max_prot %#x", prot, max)); 1556 1557 /* 1558 * Check that the start and end points are not bogus. 1559 */ 1560 if (start == end || !vm_map_range_valid(map, start, end)) 1561 return (KERN_INVALID_ADDRESS); 1562 1563 /* 1564 * Find the entry prior to the proposed starting address; if it's part 1565 * of an existing entry, this range is bogus. 1566 */ 1567 if (vm_map_lookup_entry(map, start, &prev_entry)) 1568 return (KERN_NO_SPACE); 1569 1570 /* 1571 * Assert that the next entry doesn't overlap the end point. 1572 */ 1573 next_entry = vm_map_entry_succ(prev_entry); 1574 if (next_entry->start < end) 1575 return (KERN_NO_SPACE); 1576 1577 if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL || 1578 max != VM_PROT_NONE)) 1579 return (KERN_INVALID_ARGUMENT); 1580 1581 protoeflags = 0; 1582 if (cow & MAP_COPY_ON_WRITE) 1583 protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; 1584 if (cow & MAP_NOFAULT) 1585 protoeflags |= MAP_ENTRY_NOFAULT; 1586 if (cow & MAP_DISABLE_SYNCER) 1587 protoeflags |= MAP_ENTRY_NOSYNC; 1588 if (cow & MAP_DISABLE_COREDUMP) 1589 protoeflags |= MAP_ENTRY_NOCOREDUMP; 1590 if (cow & MAP_STACK_GROWS_DOWN) 1591 protoeflags |= MAP_ENTRY_GROWS_DOWN; 1592 if (cow & MAP_STACK_GROWS_UP) 1593 protoeflags |= MAP_ENTRY_GROWS_UP; 1594 if (cow & MAP_WRITECOUNT) 1595 protoeflags |= MAP_ENTRY_WRITECNT; 1596 if (cow & MAP_VN_EXEC) 1597 protoeflags |= MAP_ENTRY_VN_EXEC; 1598 if ((cow & MAP_CREATE_GUARD) != 0) 1599 protoeflags |= MAP_ENTRY_GUARD; 1600 if ((cow & MAP_CREATE_STACK_GAP_DN) != 0) 1601 protoeflags |= MAP_ENTRY_STACK_GAP_DN; 1602 if ((cow & MAP_CREATE_STACK_GAP_UP) != 0) 1603 protoeflags |= MAP_ENTRY_STACK_GAP_UP; 1604 if (cow & MAP_INHERIT_SHARE) 1605 inheritance = VM_INHERIT_SHARE; 1606 else 1607 inheritance = VM_INHERIT_DEFAULT; 1608 if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) { 1609 /* This magically ignores index 0, for usual page size. 
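		 * (An encoded boundary index of 0 leaves
		 * MAP_SPLIT_BOUNDARY_MASK clear, so this block is never
		 * entered for it; base-page alignment is the implicit
		 * default.)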
*/ 1610 bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >> 1611 MAP_SPLIT_BOUNDARY_SHIFT; 1612 if (bidx >= MAXPAGESIZES) 1613 return (KERN_INVALID_ARGUMENT); 1614 bdry = pagesizes[bidx] - 1; 1615 if ((start & bdry) != 0 || (end & bdry) != 0) 1616 return (KERN_INVALID_ARGUMENT); 1617 protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; 1618 } 1619 1620 cred = NULL; 1621 if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0) 1622 goto charged; 1623 if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) && 1624 ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) { 1625 if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start)) 1626 return (KERN_RESOURCE_SHORTAGE); 1627 KASSERT(object == NULL || 1628 (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 || 1629 object->cred == NULL, 1630 ("overcommit: vm_map_insert o %p", object)); 1631 cred = curthread->td_ucred; 1632 } 1633 1634 charged: 1635 /* Expand the kernel pmap, if necessary. */ 1636 if (map == kernel_map && end > kernel_vm_end) 1637 pmap_growkernel(end); 1638 if (object != NULL) { 1639 /* 1640 * OBJ_ONEMAPPING must be cleared unless this mapping 1641 * is trivially proven to be the only mapping for any 1642 * of the object's pages. (Object granularity 1643 * reference counting is insufficient to recognize 1644 * aliases with precision.) 1645 */ 1646 if ((object->flags & OBJ_ANON) != 0) { 1647 VM_OBJECT_WLOCK(object); 1648 if (object->ref_count > 1 || object->shadow_count != 0) 1649 vm_object_clear_flag(object, OBJ_ONEMAPPING); 1650 VM_OBJECT_WUNLOCK(object); 1651 } 1652 } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) == 1653 protoeflags && 1654 (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP | 1655 MAP_VN_EXEC)) == 0 && 1656 prev_entry->end == start && (prev_entry->cred == cred || 1657 (prev_entry->object.vm_object != NULL && 1658 prev_entry->object.vm_object->cred == cred)) && 1659 vm_object_coalesce(prev_entry->object.vm_object, 1660 prev_entry->offset, 1661 (vm_size_t)(prev_entry->end - prev_entry->start), 1662 (vm_size_t)(end - prev_entry->end), cred != NULL && 1663 (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) { 1664 /* 1665 * We were able to extend the object. Determine if we 1666 * can extend the previous map entry to include the 1667 * new range as well. 1668 */ 1669 if (prev_entry->inheritance == inheritance && 1670 prev_entry->protection == prot && 1671 prev_entry->max_protection == max && 1672 prev_entry->wired_count == 0) { 1673 KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) == 1674 0, ("prev_entry %p has incoherent wiring", 1675 prev_entry)); 1676 if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0) 1677 map->size += end - prev_entry->end; 1678 vm_map_entry_resize(map, prev_entry, 1679 end - prev_entry->end); 1680 vm_map_try_merge_entries(map, prev_entry, next_entry); 1681 return (KERN_SUCCESS); 1682 } 1683 1684 /* 1685 * If we can extend the object but cannot extend the 1686 * map entry, we have to create a new map entry. We 1687 * must bump the ref count on the extended object to 1688 * account for it. object may be NULL. 1689 */ 1690 object = prev_entry->object.vm_object; 1691 offset = prev_entry->offset + 1692 (prev_entry->end - prev_entry->start); 1693 vm_object_reference(object); 1694 if (cred != NULL && object != NULL && object->cred != NULL && 1695 !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { 1696 /* Object already accounts for this uid. 
			 */
			cred = NULL;
		}
	}
	if (cred != NULL)
		crhold(cred);

	/*
	 * Create a new entry
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->start = start;
	new_entry->end = end;
	new_entry->cred = NULL;

	new_entry->eflags = protoeflags;
	new_entry->object.vm_object = object;
	new_entry->offset = offset;

	new_entry->inheritance = inheritance;
	new_entry->protection = prot;
	new_entry->max_protection = max;
	new_entry->wired_count = 0;
	new_entry->wiring_thread = NULL;
	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
	new_entry->next_read = start;

	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
	new_entry->cred = cred;

	/*
	 * Insert the new entry into the list
	 */
	vm_map_entry_link(map, new_entry);
	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
		map->size += new_entry->end - new_entry->start;

	/*
	 * Try to coalesce the new entry with both the previous and next
	 * entries in the list.  Previously, we only attempted to coalesce
	 * with the previous entry when object is NULL.  Here, we handle the
	 * other cases, which are less common.
	 */
	vm_map_try_merge_entries(map, prev_entry, new_entry);
	vm_map_try_merge_entries(map, new_entry, next_entry);

	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
		    end - start, cow & MAP_PREFAULT_PARTIAL);
	}

	return (KERN_SUCCESS);
}

/*
 * vm_map_findspace:
 *
 * Find the first fit (lowest VM address) for "length" free bytes
 * beginning at address >= start in the given map.
 *
 * In a vm_map_entry, "max_free" is the maximum amount of
 * contiguous free space between an entry in its subtree and a
 * neighbor of that entry.  This allows finding a free region in
 * one path down the tree, so O(log n) amortized with splay
 * trees.
 *
 * The map must be locked, and leaves it so.
 *
 * Returns: starting address if sufficient space,
 *	    vm_map_max(map)-length+1 if insufficient space.
 */
vm_offset_t
vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
{
	vm_map_entry_t header, llist, rlist, root, y;
	vm_size_t left_length, max_free_left, max_free_right;
	vm_offset_t gap_end;

	/*
	 * Request must fit within min/max VM address and must avoid
	 * address wrap.
	 */
	start = MAX(start, vm_map_min(map));
	if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
		return (vm_map_max(map) - length + 1);

	/* Empty tree means wide open address space. */
	if (map->root == NULL)
		return (start);

	/*
	 * After splay_split, if start is within an entry, push it to the start
	 * of the following gap.  If rlist is at the end of the gap containing
	 * start, save the end of that gap in gap_end to see if the gap is big
	 * enough; otherwise set gap_end to start to skip gap-checking and move
	 * directly to a search of the right subtree.
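	 *
	 * For example, with entries [4K, 8K) and [12K, 16K) and start == 6K
	 * (figures illustrative), start is pushed to 8K and gap_end becomes
	 * 12K; a 2K request then succeeds at 8K, while an 8K request fails
	 * the quick check and falls through to the right-subtree search.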
1793 */ 1794 header = &map->header; 1795 root = vm_map_splay_split(map, start, length, &llist, &rlist); 1796 gap_end = rlist->start; 1797 if (root != NULL) { 1798 start = root->end; 1799 if (root->right != rlist) 1800 gap_end = start; 1801 max_free_left = vm_map_splay_merge_left(header, root, llist); 1802 max_free_right = vm_map_splay_merge_right(header, root, rlist); 1803 } else if (rlist != header) { 1804 root = rlist; 1805 rlist = root->left; 1806 max_free_left = vm_map_splay_merge_pred(header, root, llist); 1807 max_free_right = vm_map_splay_merge_right(header, root, rlist); 1808 } else { 1809 root = llist; 1810 llist = root->right; 1811 max_free_left = vm_map_splay_merge_left(header, root, llist); 1812 max_free_right = vm_map_splay_merge_succ(header, root, rlist); 1813 } 1814 root->max_free = vm_size_max(max_free_left, max_free_right); 1815 map->root = root; 1816 VM_MAP_ASSERT_CONSISTENT(map); 1817 if (length <= gap_end - start) 1818 return (start); 1819 1820 /* With max_free, can immediately tell if no solution. */ 1821 if (root->right == header || length > root->right->max_free) 1822 return (vm_map_max(map) - length + 1); 1823 1824 /* 1825 * Splay for the least large-enough gap in the right subtree. 1826 */ 1827 llist = rlist = header; 1828 for (left_length = 0;; 1829 left_length = vm_map_entry_max_free_left(root, llist)) { 1830 if (length <= left_length) 1831 SPLAY_LEFT_STEP(root, y, llist, rlist, 1832 length <= vm_map_entry_max_free_left(y, llist)); 1833 else 1834 SPLAY_RIGHT_STEP(root, y, llist, rlist, 1835 length > vm_map_entry_max_free_left(y, root)); 1836 if (root == NULL) 1837 break; 1838 } 1839 root = llist; 1840 llist = root->right; 1841 max_free_left = vm_map_splay_merge_left(header, root, llist); 1842 if (rlist == header) { 1843 root->max_free = vm_size_max(max_free_left, 1844 vm_map_splay_merge_succ(header, root, rlist)); 1845 } else { 1846 y = rlist; 1847 rlist = y->left; 1848 y->max_free = vm_size_max( 1849 vm_map_splay_merge_pred(root, y, root), 1850 vm_map_splay_merge_right(header, y, rlist)); 1851 root->max_free = vm_size_max(max_free_left, y->max_free); 1852 } 1853 map->root = root; 1854 VM_MAP_ASSERT_CONSISTENT(map); 1855 return (root->end); 1856 } 1857 1858 int 1859 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 1860 vm_offset_t start, vm_size_t length, vm_prot_t prot, 1861 vm_prot_t max, int cow) 1862 { 1863 vm_offset_t end; 1864 int result; 1865 1866 end = start + length; 1867 KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || 1868 object == NULL, 1869 ("vm_map_fixed: non-NULL backing object for stack")); 1870 vm_map_lock(map); 1871 VM_MAP_RANGE_CHECK(map, start, end); 1872 if ((cow & MAP_CHECK_EXCL) == 0) { 1873 result = vm_map_delete(map, start, end); 1874 if (result != KERN_SUCCESS) 1875 goto out; 1876 } 1877 if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { 1878 result = vm_map_stack_locked(map, start, length, sgrowsiz, 1879 prot, max, cow); 1880 } else { 1881 result = vm_map_insert(map, object, offset, start, end, 1882 prot, max, cow); 1883 } 1884 out: 1885 vm_map_unlock(map); 1886 return (result); 1887 } 1888 1889 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10}; 1890 static const int aslr_pages_rnd_32[2] = {0x100, 0x4}; 1891 1892 static int cluster_anon = 1; 1893 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW, 1894 &cluster_anon, 0, 1895 "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always"); 1896 1897 static bool 1898 clustering_anon_allowed(vm_offset_t addr) 1899 { 1900 1901 
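	/* Default is 1: cluster only when the caller supplies no address hint. */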
switch (cluster_anon) { 1902 case 0: 1903 return (false); 1904 case 1: 1905 return (addr == 0); 1906 case 2: 1907 default: 1908 return (true); 1909 } 1910 } 1911 1912 static long aslr_restarts; 1913 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD, 1914 &aslr_restarts, 0, 1915 "Number of aslr failures"); 1916 1917 /* 1918 * Searches for the specified amount of free space in the given map with the 1919 * specified alignment. Performs an address-ordered, first-fit search from 1920 * the given address "*addr", with an optional upper bound "max_addr". If the 1921 * parameter "alignment" is zero, then the alignment is computed from the 1922 * given (object, offset) pair so as to enable the greatest possible use of 1923 * superpage mappings. Returns KERN_SUCCESS and the address of the free space 1924 * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE. 1925 * 1926 * The map must be locked. Initially, there must be at least "length" bytes 1927 * of free space at the given address. 1928 */ 1929 static int 1930 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 1931 vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr, 1932 vm_offset_t alignment) 1933 { 1934 vm_offset_t aligned_addr, free_addr; 1935 1936 VM_MAP_ASSERT_LOCKED(map); 1937 free_addr = *addr; 1938 KASSERT(free_addr == vm_map_findspace(map, free_addr, length), 1939 ("caller failed to provide space %#jx at address %p", 1940 (uintmax_t)length, (void *)free_addr)); 1941 for (;;) { 1942 /* 1943 * At the start of every iteration, the free space at address 1944 * "*addr" is at least "length" bytes. 1945 */ 1946 if (alignment == 0) 1947 pmap_align_superpage(object, offset, addr, length); 1948 else if ((*addr & (alignment - 1)) != 0) { 1949 *addr &= ~(alignment - 1); 1950 *addr += alignment; 1951 } 1952 aligned_addr = *addr; 1953 if (aligned_addr == free_addr) { 1954 /* 1955 * Alignment did not change "*addr", so "*addr" must 1956 * still provide sufficient free space. 1957 */ 1958 return (KERN_SUCCESS); 1959 } 1960 1961 /* 1962 * Test for address wrap on "*addr". A wrapped "*addr" could 1963 * be a valid address, in which case vm_map_findspace() cannot 1964 * be relied upon to fail. 1965 */ 1966 if (aligned_addr < free_addr) 1967 return (KERN_NO_SPACE); 1968 *addr = vm_map_findspace(map, aligned_addr, length); 1969 if (*addr + length > vm_map_max(map) || 1970 (max_addr != 0 && *addr + length > max_addr)) 1971 return (KERN_NO_SPACE); 1972 free_addr = *addr; 1973 if (free_addr == aligned_addr) { 1974 /* 1975 * If a successful call to vm_map_findspace() did not 1976 * change "*addr", then "*addr" must still be aligned 1977 * and provide sufficient free space. 1978 */ 1979 return (KERN_SUCCESS); 1980 } 1981 } 1982 } 1983 1984 int 1985 vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length, 1986 vm_offset_t max_addr, vm_offset_t alignment) 1987 { 1988 /* XXXKIB ASLR eh ? */ 1989 *addr = vm_map_findspace(map, *addr, length); 1990 if (*addr + length > vm_map_max(map) || 1991 (max_addr != 0 && *addr + length > max_addr)) 1992 return (KERN_NO_SPACE); 1993 return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr, 1994 alignment)); 1995 } 1996 1997 /* 1998 * vm_map_find finds an unallocated region in the target address 1999 * map with the given length. The search is defined to be 2000 * first-fit from the specified address; the region found is 2001 * returned in the same parameter. 
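 *
 *	A minimal illustrative call, assuming hypothetical caller-supplied
 *	"map" and "size", asking for any free range with default
 *	protections and no backing object:
 *
 *		vm_offset_t addr;
 *		int rv;
 *
 *		addr = vm_map_min(map);
 *		rv = vm_map_find(map, NULL, 0, &addr, size, 0,
 *		    VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
 *		if (rv != KERN_SUCCESS)
 *			return (vm_mmap_to_errno(rv));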
2002 * 2003 * If object is non-NULL, ref count must be bumped by caller 2004 * prior to making call to account for the new entry. 2005 */ 2006 int 2007 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 2008 vm_offset_t *addr, /* IN/OUT */ 2009 vm_size_t length, vm_offset_t max_addr, int find_space, 2010 vm_prot_t prot, vm_prot_t max, int cow) 2011 { 2012 vm_offset_t alignment, curr_min_addr, min_addr; 2013 int gap, pidx, rv, try; 2014 bool cluster, en_aslr, update_anon; 2015 2016 KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || 2017 object == NULL, 2018 ("vm_map_find: non-NULL backing object for stack")); 2019 MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE && 2020 (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0)); 2021 if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL || 2022 (object->flags & OBJ_COLORED) == 0)) 2023 find_space = VMFS_ANY_SPACE; 2024 if (find_space >> 8 != 0) { 2025 KASSERT((find_space & 0xff) == 0, ("bad VMFS flags")); 2026 alignment = (vm_offset_t)1 << (find_space >> 8); 2027 } else 2028 alignment = 0; 2029 en_aslr = (map->flags & MAP_ASLR) != 0; 2030 update_anon = cluster = clustering_anon_allowed(*addr) && 2031 (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 && 2032 find_space != VMFS_NO_SPACE && object == NULL && 2033 (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP | 2034 MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE; 2035 curr_min_addr = min_addr = *addr; 2036 if (en_aslr && min_addr == 0 && !cluster && 2037 find_space != VMFS_NO_SPACE && 2038 (map->flags & MAP_ASLR_IGNSTART) != 0) 2039 curr_min_addr = min_addr = vm_map_min(map); 2040 try = 0; 2041 vm_map_lock(map); 2042 if (cluster) { 2043 curr_min_addr = map->anon_loc; 2044 if (curr_min_addr == 0) 2045 cluster = false; 2046 } 2047 if (find_space != VMFS_NO_SPACE) { 2048 KASSERT(find_space == VMFS_ANY_SPACE || 2049 find_space == VMFS_OPTIMAL_SPACE || 2050 find_space == VMFS_SUPER_SPACE || 2051 alignment != 0, ("unexpected VMFS flag")); 2052 again: 2053 /* 2054 * When creating an anonymous mapping, try clustering 2055 * with an existing anonymous mapping first. 2056 * 2057 * We make up to two attempts to find address space 2058 * for a given find_space value. The first attempt may 2059 * apply randomization or may cluster with an existing 2060 * anonymous mapping. If this first attempt fails, 2061 * perform a first-fit search of the available address 2062 * space. 2063 * 2064 * If all tries failed, and find_space is 2065 * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE. 2066 * Again enable clustering and randomization. 2067 */ 2068 try++; 2069 MPASS(try <= 2); 2070 2071 if (try == 2) { 2072 /* 2073 * Second try: we failed either to find a 2074 * suitable region for randomizing the 2075 * allocation, or to cluster with an existing 2076 * mapping. Retry with free run. 2077 */ 2078 curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ? 2079 vm_map_min(map) : min_addr; 2080 atomic_add_long(&aslr_restarts, 1); 2081 } 2082 2083 if (try == 1 && en_aslr && !cluster) { 2084 /* 2085 * Find space for allocation, including 2086 * gap needed for later randomization. 2087 */ 2088 pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 && 2089 (find_space == VMFS_SUPER_SPACE || find_space == 2090 VMFS_OPTIMAL_SPACE) ? 1 : 0; 2091 gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR && 2092 (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ? 
2093 aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx]; 2094 *addr = vm_map_findspace(map, curr_min_addr, 2095 length + gap * pagesizes[pidx]); 2096 if (*addr + length + gap * pagesizes[pidx] > 2097 vm_map_max(map)) 2098 goto again; 2099 /* And randomize the start address. */ 2100 *addr += (arc4random() % gap) * pagesizes[pidx]; 2101 if (max_addr != 0 && *addr + length > max_addr) 2102 goto again; 2103 } else { 2104 *addr = vm_map_findspace(map, curr_min_addr, length); 2105 if (*addr + length > vm_map_max(map) || 2106 (max_addr != 0 && *addr + length > max_addr)) { 2107 if (cluster) { 2108 cluster = false; 2109 MPASS(try == 1); 2110 goto again; 2111 } 2112 rv = KERN_NO_SPACE; 2113 goto done; 2114 } 2115 } 2116 2117 if (find_space != VMFS_ANY_SPACE && 2118 (rv = vm_map_alignspace(map, object, offset, addr, length, 2119 max_addr, alignment)) != KERN_SUCCESS) { 2120 if (find_space == VMFS_OPTIMAL_SPACE) { 2121 find_space = VMFS_ANY_SPACE; 2122 curr_min_addr = min_addr; 2123 cluster = update_anon; 2124 try = 0; 2125 goto again; 2126 } 2127 goto done; 2128 } 2129 } else if ((cow & MAP_REMAP) != 0) { 2130 if (!vm_map_range_valid(map, *addr, *addr + length)) { 2131 rv = KERN_INVALID_ADDRESS; 2132 goto done; 2133 } 2134 rv = vm_map_delete(map, *addr, *addr + length); 2135 if (rv != KERN_SUCCESS) 2136 goto done; 2137 } 2138 if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { 2139 rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot, 2140 max, cow); 2141 } else { 2142 rv = vm_map_insert(map, object, offset, *addr, *addr + length, 2143 prot, max, cow); 2144 } 2145 if (rv == KERN_SUCCESS && update_anon) 2146 map->anon_loc = *addr + length; 2147 done: 2148 vm_map_unlock(map); 2149 return (rv); 2150 } 2151 2152 /* 2153 * vm_map_find_min() is a variant of vm_map_find() that takes an 2154 * additional parameter (min_addr) and treats the given address 2155 * (*addr) differently. Specifically, it treats *addr as a hint 2156 * and not as the minimum address where the mapping is created. 2157 * 2158 * This function works in two phases. First, it tries to 2159 * allocate above the hint. If that fails and the hint is 2160 * greater than min_addr, it performs a second pass, replacing 2161 * the hint with min_addr as the minimum address for the 2162 * allocation. 2163 */ 2164 int 2165 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 2166 vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr, 2167 vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, 2168 int cow) 2169 { 2170 vm_offset_t hint; 2171 int rv; 2172 2173 hint = *addr; 2174 for (;;) { 2175 rv = vm_map_find(map, object, offset, addr, length, max_addr, 2176 find_space, prot, max, cow); 2177 if (rv == KERN_SUCCESS || min_addr >= hint) 2178 return (rv); 2179 *addr = hint = min_addr; 2180 } 2181 } 2182 2183 /* 2184 * A map entry with any of the following flags set must not be merged with 2185 * another entry. 
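 * Such entries carry state that is tied to their exact boundaries:
 * stack-growth bookkeeping, wiring or unwiring still in progress, a
 * reference to a subordinate map, or executable-vnode accounting.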
2186 */
2187 #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
2188 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC)
2189
2190 static bool
2191 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
2192 {
2193
2194 KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
2195 (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
2196 ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
2197 prev, entry));
2198 return (prev->end == entry->start &&
2199 prev->object.vm_object == entry->object.vm_object &&
2200 (prev->object.vm_object == NULL ||
2201 prev->offset + (prev->end - prev->start) == entry->offset) &&
2202 prev->eflags == entry->eflags &&
2203 prev->protection == entry->protection &&
2204 prev->max_protection == entry->max_protection &&
2205 prev->inheritance == entry->inheritance &&
2206 prev->wired_count == entry->wired_count &&
2207 prev->cred == entry->cred);
2208 }
2209
2210 static void
2211 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
2212 {
2213
2214 /*
2215 * If the backing object is a vnode object, vm_object_deallocate()
2216 * calls vrele(). However, vrele() does not lock the vnode because
2217 * the vnode has additional references. Thus, the map lock can be
2218 * kept without causing a lock-order reversal with the vnode lock.
2219 *
2220 * Since we count the number of virtual page mappings in
2221 * object->un_pager.vnp.writemappings, the writemappings value
2222 * should not be adjusted when the entry is disposed of.
2223 */
2224 if (entry->object.vm_object != NULL)
2225 vm_object_deallocate(entry->object.vm_object);
2226 if (entry->cred != NULL)
2227 crfree(entry->cred);
2228 vm_map_entry_dispose(map, entry);
2229 }
2230
2231 /*
2232 * vm_map_try_merge_entries:
2233 *
2234 * Compare the given map entry to its predecessor, and merge the predecessor
2235 * into it if possible. The entry remains valid, and may be extended.
2236 * The predecessor may be deleted.
2237 *
2238 * The map must be locked.
2239 */
2240 void
2241 vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
2242 vm_map_entry_t entry)
2243 {
2244
2245 VM_MAP_ASSERT_LOCKED(map);
2246 if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
2247 vm_map_mergeable_neighbors(prev_entry, entry)) {
2248 vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
2249 vm_map_merged_neighbor_dispose(map, prev_entry);
2250 }
2251 }
2252
2253 /*
2254 * vm_map_entry_back:
2255 *
2256 * Allocate an object to back a map entry.
2257 */
2258 static inline void
2259 vm_map_entry_back(vm_map_entry_t entry)
2260 {
2261 vm_object_t object;
2262
2263 KASSERT(entry->object.vm_object == NULL,
2264 ("map entry %p has backing object", entry));
2265 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2266 ("map entry %p is a submap", entry));
2267 object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
2268 entry->cred, entry->end - entry->start);
2269 entry->object.vm_object = object;
2270 entry->offset = 0;
2271 entry->cred = NULL;
2272 }
2273
2274 /*
2275 * vm_map_entry_charge_object
2276 *
2277 * If there is no object backing this entry, create one. Otherwise, if
2278 * the entry has cred, give it to the backing object.
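 * This is done before an entry is clipped (see vm_map_entry_clone()) so
 * that a single backing object, and a single swap charge, spans both
 * pieces of the split.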
2279 */ 2280 static inline void 2281 vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry) 2282 { 2283 2284 VM_MAP_ASSERT_LOCKED(map); 2285 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, 2286 ("map entry %p is a submap", entry)); 2287 if (entry->object.vm_object == NULL && !map->system_map && 2288 (entry->eflags & MAP_ENTRY_GUARD) == 0) 2289 vm_map_entry_back(entry); 2290 else if (entry->object.vm_object != NULL && 2291 ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && 2292 entry->cred != NULL) { 2293 VM_OBJECT_WLOCK(entry->object.vm_object); 2294 KASSERT(entry->object.vm_object->cred == NULL, 2295 ("OVERCOMMIT: %s: both cred e %p", __func__, entry)); 2296 entry->object.vm_object->cred = entry->cred; 2297 entry->object.vm_object->charge = entry->end - entry->start; 2298 VM_OBJECT_WUNLOCK(entry->object.vm_object); 2299 entry->cred = NULL; 2300 } 2301 } 2302 2303 /* 2304 * vm_map_entry_clone 2305 * 2306 * Create a duplicate map entry for clipping. 2307 */ 2308 static vm_map_entry_t 2309 vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry) 2310 { 2311 vm_map_entry_t new_entry; 2312 2313 VM_MAP_ASSERT_LOCKED(map); 2314 2315 /* 2316 * Create a backing object now, if none exists, so that more individual 2317 * objects won't be created after the map entry is split. 2318 */ 2319 vm_map_entry_charge_object(map, entry); 2320 2321 /* Clone the entry. */ 2322 new_entry = vm_map_entry_create(map); 2323 *new_entry = *entry; 2324 if (new_entry->cred != NULL) 2325 crhold(entry->cred); 2326 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { 2327 vm_object_reference(new_entry->object.vm_object); 2328 vm_map_entry_set_vnode_text(new_entry, true); 2329 /* 2330 * The object->un_pager.vnp.writemappings for the object of 2331 * MAP_ENTRY_WRITECNT type entry shall be kept as is here. The 2332 * virtual pages are re-distributed among the clipped entries, 2333 * so the sum is left the same. 2334 */ 2335 } 2336 return (new_entry); 2337 } 2338 2339 /* 2340 * vm_map_clip_start: [ internal use only ] 2341 * 2342 * Asserts that the given entry begins at or after 2343 * the specified address; if necessary, 2344 * it splits the entry into two. 2345 */ 2346 static int 2347 vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr) 2348 { 2349 vm_map_entry_t new_entry; 2350 int bdry_idx; 2351 2352 if (!map->system_map) 2353 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 2354 "%s: map %p entry %p start 0x%jx", __func__, map, entry, 2355 (uintmax_t)startaddr); 2356 2357 if (startaddr <= entry->start) 2358 return (KERN_SUCCESS); 2359 2360 VM_MAP_ASSERT_LOCKED(map); 2361 KASSERT(entry->end > startaddr && entry->start < startaddr, 2362 ("%s: invalid clip of entry %p", __func__, entry)); 2363 2364 bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> 2365 MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; 2366 if (bdry_idx != 0) { 2367 if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0) 2368 return (KERN_INVALID_ARGUMENT); 2369 } 2370 2371 new_entry = vm_map_entry_clone(map, entry); 2372 2373 /* 2374 * Split off the front portion. Insert the new entry BEFORE this one, 2375 * so that this entry has the specified starting address. 2376 */ 2377 new_entry->end = startaddr; 2378 vm_map_entry_link(map, new_entry); 2379 return (KERN_SUCCESS); 2380 } 2381 2382 /* 2383 * vm_map_lookup_clip_start: 2384 * 2385 * Find the entry at or just after 'start', and clip it if 'start' is in 2386 * the interior of the entry. Return entry after 'start', and in 2387 * prev_entry set the entry before 'start'. 
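 * On success, *res_entry either begins exactly at 'start' (after
 * clipping) or, when 'start' falls within a hole in the map, is the
 * first entry beyond it.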
2388 */
2389 static int
2390 vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
2391 vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
2392 {
2393 vm_map_entry_t entry;
2394 int rv;
2395
2396 if (!map->system_map)
2397 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2398 "%s: map %p start 0x%jx prev %p", __func__, map,
2399 (uintmax_t)start, prev_entry);
2400
2401 if (vm_map_lookup_entry(map, start, prev_entry)) {
2402 entry = *prev_entry;
2403 rv = vm_map_clip_start(map, entry, start);
2404 if (rv != KERN_SUCCESS)
2405 return (rv);
2406 *prev_entry = vm_map_entry_pred(entry);
2407 } else
2408 entry = vm_map_entry_succ(*prev_entry);
2409 *res_entry = entry;
2410 return (KERN_SUCCESS);
2411 }
2412
2413 /*
2414 * vm_map_clip_end: [ internal use only ]
2415 *
2416 * Asserts that the given entry ends at or before
2417 * the specified address; if necessary,
2418 * it splits the entry into two.
2419 */
2420 static int
2421 vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
2422 {
2423 vm_map_entry_t new_entry;
2424 int bdry_idx;
2425
2426 if (!map->system_map)
2427 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2428 "%s: map %p entry %p end 0x%jx", __func__, map, entry,
2429 (uintmax_t)endaddr);
2430
2431 if (endaddr >= entry->end)
2432 return (KERN_SUCCESS);
2433
2434 VM_MAP_ASSERT_LOCKED(map);
2435 KASSERT(entry->start < endaddr && entry->end > endaddr,
2436 ("%s: invalid clip of entry %p", __func__, entry));
2437
2438 bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
2439 MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
2440 if (bdry_idx != 0) {
2441 if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
2442 return (KERN_INVALID_ARGUMENT);
2443 }
2444
2445 new_entry = vm_map_entry_clone(map, entry);
2446
2447 /*
2448 * Split off the back portion. Insert the new entry AFTER this one,
2449 * so that this entry has the specified ending address.
2450 */
2451 new_entry->start = endaddr;
2452 vm_map_entry_link(map, new_entry);
2453
2454 return (KERN_SUCCESS);
2455 }
2456
2457 /*
2458 * vm_map_submap: [ kernel use only ]
2459 *
2460 * Mark the given range as handled by a subordinate map.
2461 *
2462 * This range must have been created with vm_map_find,
2463 * and no other operations may have been performed on this
2464 * range prior to calling vm_map_submap.
2465 *
2466 * Only a limited number of operations can be performed
2467 * within this range after calling vm_map_submap:
2468 * vm_fault
2469 * [Don't try vm_map_copy!]
2470 *
2471 * To remove a submapping, one must first remove the
2472 * range from the superior map, and then destroy the
2473 * submap (if desired). [Better yet, don't try it.]
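 *
 * Illustrative setup sequence (a sketch, not a prescription): reserve
 * the range in the parent with vm_map_find(), create the subordinate
 * map with vm_map_create(), and only then install it here.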
2474 */ 2475 int 2476 vm_map_submap( 2477 vm_map_t map, 2478 vm_offset_t start, 2479 vm_offset_t end, 2480 vm_map_t submap) 2481 { 2482 vm_map_entry_t entry; 2483 int result; 2484 2485 result = KERN_INVALID_ARGUMENT; 2486 2487 vm_map_lock(submap); 2488 submap->flags |= MAP_IS_SUB_MAP; 2489 vm_map_unlock(submap); 2490 2491 vm_map_lock(map); 2492 VM_MAP_RANGE_CHECK(map, start, end); 2493 if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end && 2494 (entry->eflags & MAP_ENTRY_COW) == 0 && 2495 entry->object.vm_object == NULL) { 2496 result = vm_map_clip_start(map, entry, start); 2497 if (result != KERN_SUCCESS) 2498 goto unlock; 2499 result = vm_map_clip_end(map, entry, end); 2500 if (result != KERN_SUCCESS) 2501 goto unlock; 2502 entry->object.sub_map = submap; 2503 entry->eflags |= MAP_ENTRY_IS_SUB_MAP; 2504 result = KERN_SUCCESS; 2505 } 2506 unlock: 2507 vm_map_unlock(map); 2508 2509 if (result != KERN_SUCCESS) { 2510 vm_map_lock(submap); 2511 submap->flags &= ~MAP_IS_SUB_MAP; 2512 vm_map_unlock(submap); 2513 } 2514 return (result); 2515 } 2516 2517 /* 2518 * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified 2519 */ 2520 #define MAX_INIT_PT 96 2521 2522 /* 2523 * vm_map_pmap_enter: 2524 * 2525 * Preload the specified map's pmap with mappings to the specified 2526 * object's memory-resident pages. No further physical pages are 2527 * allocated, and no further virtual pages are retrieved from secondary 2528 * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a 2529 * limited number of page mappings are created at the low-end of the 2530 * specified address range. (For this purpose, a superpage mapping 2531 * counts as one page mapping.) Otherwise, all resident pages within 2532 * the specified address range are mapped. 2533 */ 2534 static void 2535 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, 2536 vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) 2537 { 2538 vm_offset_t start; 2539 vm_page_t p, p_start; 2540 vm_pindex_t mask, psize, threshold, tmpidx; 2541 2542 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) 2543 return; 2544 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { 2545 VM_OBJECT_WLOCK(object); 2546 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { 2547 pmap_object_init_pt(map->pmap, addr, object, pindex, 2548 size); 2549 VM_OBJECT_WUNLOCK(object); 2550 return; 2551 } 2552 VM_OBJECT_LOCK_DOWNGRADE(object); 2553 } else 2554 VM_OBJECT_RLOCK(object); 2555 2556 psize = atop(size); 2557 if (psize + pindex > object->size) { 2558 if (pindex >= object->size) { 2559 VM_OBJECT_RUNLOCK(object); 2560 return; 2561 } 2562 psize = object->size - pindex; 2563 } 2564 2565 start = 0; 2566 p_start = NULL; 2567 threshold = MAX_INIT_PT; 2568 2569 p = vm_page_find_least(object, pindex); 2570 /* 2571 * Assert: the variable p is either (1) the page with the 2572 * least pindex greater than or equal to the parameter pindex 2573 * or (2) NULL. 2574 */ 2575 for (; 2576 p != NULL && (tmpidx = p->pindex - pindex) < psize; 2577 p = TAILQ_NEXT(p, listq)) { 2578 /* 2579 * don't allow an madvise to blow away our really 2580 * free pages allocating pv entries. 
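 * That is, when prefaulting on behalf of madvise() and free memory is
 * severely depleted, stop early so that pv entry allocations do not
 * consume the last free pages.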
2581 */ 2582 if (((flags & MAP_PREFAULT_MADVISE) != 0 && 2583 vm_page_count_severe()) || 2584 ((flags & MAP_PREFAULT_PARTIAL) != 0 && 2585 tmpidx >= threshold)) { 2586 psize = tmpidx; 2587 break; 2588 } 2589 if (vm_page_all_valid(p)) { 2590 if (p_start == NULL) { 2591 start = addr + ptoa(tmpidx); 2592 p_start = p; 2593 } 2594 /* Jump ahead if a superpage mapping is possible. */ 2595 if (p->psind > 0 && ((addr + ptoa(tmpidx)) & 2596 (pagesizes[p->psind] - 1)) == 0) { 2597 mask = atop(pagesizes[p->psind]) - 1; 2598 if (tmpidx + mask < psize && 2599 vm_page_ps_test(p, PS_ALL_VALID, NULL)) { 2600 p += mask; 2601 threshold += mask; 2602 } 2603 } 2604 } else if (p_start != NULL) { 2605 pmap_enter_object(map->pmap, start, addr + 2606 ptoa(tmpidx), p_start, prot); 2607 p_start = NULL; 2608 } 2609 } 2610 if (p_start != NULL) 2611 pmap_enter_object(map->pmap, start, addr + ptoa(psize), 2612 p_start, prot); 2613 VM_OBJECT_RUNLOCK(object); 2614 } 2615 2616 /* 2617 * vm_map_protect: 2618 * 2619 * Sets the protection of the specified address 2620 * region in the target map. If "set_max" is 2621 * specified, the maximum protection is to be set; 2622 * otherwise, only the current protection is affected. 2623 */ 2624 int 2625 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, 2626 vm_prot_t new_prot, boolean_t set_max) 2627 { 2628 vm_map_entry_t entry, first_entry, in_tran, prev_entry; 2629 vm_object_t obj; 2630 struct ucred *cred; 2631 vm_prot_t old_prot; 2632 int rv; 2633 2634 if (start == end) 2635 return (KERN_SUCCESS); 2636 2637 again: 2638 in_tran = NULL; 2639 vm_map_lock(map); 2640 2641 /* 2642 * Ensure that we are not concurrently wiring pages. vm_map_wire() may 2643 * need to fault pages into the map and will drop the map lock while 2644 * doing so, and the VM object may end up in an inconsistent state if we 2645 * update the protection on the map entry in between faults. 2646 */ 2647 vm_map_wait_busy(map); 2648 2649 VM_MAP_RANGE_CHECK(map, start, end); 2650 2651 if (!vm_map_lookup_entry(map, start, &first_entry)) 2652 first_entry = vm_map_entry_succ(first_entry); 2653 2654 /* 2655 * Make a first pass to check for protection violations. 2656 */ 2657 for (entry = first_entry; entry->start < end; 2658 entry = vm_map_entry_succ(entry)) { 2659 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) 2660 continue; 2661 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { 2662 vm_map_unlock(map); 2663 return (KERN_INVALID_ARGUMENT); 2664 } 2665 if ((new_prot & entry->max_protection) != new_prot) { 2666 vm_map_unlock(map); 2667 return (KERN_PROTECTION_FAILURE); 2668 } 2669 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) 2670 in_tran = entry; 2671 } 2672 2673 /* 2674 * Postpone the operation until all in-transition map entries have 2675 * stabilized. An in-transition entry might already have its pages 2676 * wired and wired_count incremented, but not yet have its 2677 * MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call 2678 * vm_fault_copy_entry() in the final loop below. 2679 */ 2680 if (in_tran != NULL) { 2681 in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 2682 vm_map_unlock_and_wait(map, 0); 2683 goto again; 2684 } 2685 2686 /* 2687 * Before changing the protections, try to reserve swap space for any 2688 * private (i.e., copy-on-write) mappings that are transitioning from 2689 * read-only to read/write access. If a reservation fails, break out 2690 * of this loop early and let the next loop simplify the entries, since 2691 * some may now be mergeable. 
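 * A successful reservation is charged to the current thread's
 * credential and recorded either in entry->cred or, for the
 * whole-object case, in the object's cred and charge fields.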
2692 */ 2693 rv = vm_map_clip_start(map, first_entry, start); 2694 if (rv != KERN_SUCCESS) { 2695 vm_map_unlock(map); 2696 return (rv); 2697 } 2698 for (entry = first_entry; entry->start < end; 2699 entry = vm_map_entry_succ(entry)) { 2700 rv = vm_map_clip_end(map, entry, end); 2701 if (rv != KERN_SUCCESS) { 2702 vm_map_unlock(map); 2703 return (rv); 2704 } 2705 2706 if (set_max || 2707 ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 || 2708 ENTRY_CHARGED(entry) || 2709 (entry->eflags & MAP_ENTRY_GUARD) != 0) { 2710 continue; 2711 } 2712 2713 cred = curthread->td_ucred; 2714 obj = entry->object.vm_object; 2715 2716 if (obj == NULL || 2717 (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) { 2718 if (!swap_reserve(entry->end - entry->start)) { 2719 rv = KERN_RESOURCE_SHORTAGE; 2720 end = entry->end; 2721 break; 2722 } 2723 crhold(cred); 2724 entry->cred = cred; 2725 continue; 2726 } 2727 2728 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) 2729 continue; 2730 VM_OBJECT_WLOCK(obj); 2731 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) { 2732 VM_OBJECT_WUNLOCK(obj); 2733 continue; 2734 } 2735 2736 /* 2737 * Charge for the whole object allocation now, since 2738 * we cannot distinguish between non-charged and 2739 * charged clipped mapping of the same object later. 2740 */ 2741 KASSERT(obj->charge == 0, 2742 ("vm_map_protect: object %p overcharged (entry %p)", 2743 obj, entry)); 2744 if (!swap_reserve(ptoa(obj->size))) { 2745 VM_OBJECT_WUNLOCK(obj); 2746 rv = KERN_RESOURCE_SHORTAGE; 2747 end = entry->end; 2748 break; 2749 } 2750 2751 crhold(cred); 2752 obj->cred = cred; 2753 obj->charge = ptoa(obj->size); 2754 VM_OBJECT_WUNLOCK(obj); 2755 } 2756 2757 /* 2758 * If enough swap space was available, go back and fix up protections. 2759 * Otherwise, just simplify entries, since some may have been modified. 2760 * [Note that clipping is not necessary the second time.] 2761 */ 2762 for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry; 2763 entry->start < end; 2764 vm_map_try_merge_entries(map, prev_entry, entry), 2765 prev_entry = entry, entry = vm_map_entry_succ(entry)) { 2766 if (rv != KERN_SUCCESS || 2767 (entry->eflags & MAP_ENTRY_GUARD) != 0) 2768 continue; 2769 2770 old_prot = entry->protection; 2771 2772 if (set_max) 2773 entry->protection = 2774 (entry->max_protection = new_prot) & 2775 old_prot; 2776 else 2777 entry->protection = new_prot; 2778 2779 /* 2780 * For user wired map entries, the normal lazy evaluation of 2781 * write access upgrades through soft page faults is 2782 * undesirable. Instead, immediately copy any pages that are 2783 * copy-on-write and enable write access in the physical map. 2784 */ 2785 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 && 2786 (entry->protection & VM_PROT_WRITE) != 0 && 2787 (old_prot & VM_PROT_WRITE) == 0) 2788 vm_fault_copy_entry(map, map, entry, entry, NULL); 2789 2790 /* 2791 * When restricting access, update the physical map. Worry 2792 * about copy-on-write here. 2793 */ 2794 if ((old_prot & ~entry->protection) != 0) { 2795 #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ 2796 VM_PROT_ALL) 2797 pmap_protect(map->pmap, entry->start, 2798 entry->end, 2799 entry->protection & MASK(entry)); 2800 #undef MASK 2801 } 2802 } 2803 vm_map_try_merge_entries(map, prev_entry, entry); 2804 vm_map_unlock(map); 2805 return (rv); 2806 } 2807 2808 /* 2809 * vm_map_madvise: 2810 * 2811 * This routine traverses a processes map handling the madvise 2812 * system call. 
Advisories are classified as either those effecting 2813 * the vm_map_entry structure, or those effecting the underlying 2814 * objects. 2815 */ 2816 int 2817 vm_map_madvise( 2818 vm_map_t map, 2819 vm_offset_t start, 2820 vm_offset_t end, 2821 int behav) 2822 { 2823 vm_map_entry_t entry, prev_entry; 2824 int rv; 2825 bool modify_map; 2826 2827 /* 2828 * Some madvise calls directly modify the vm_map_entry, in which case 2829 * we need to use an exclusive lock on the map and we need to perform 2830 * various clipping operations. Otherwise we only need a read-lock 2831 * on the map. 2832 */ 2833 switch(behav) { 2834 case MADV_NORMAL: 2835 case MADV_SEQUENTIAL: 2836 case MADV_RANDOM: 2837 case MADV_NOSYNC: 2838 case MADV_AUTOSYNC: 2839 case MADV_NOCORE: 2840 case MADV_CORE: 2841 if (start == end) 2842 return (0); 2843 modify_map = true; 2844 vm_map_lock(map); 2845 break; 2846 case MADV_WILLNEED: 2847 case MADV_DONTNEED: 2848 case MADV_FREE: 2849 if (start == end) 2850 return (0); 2851 modify_map = false; 2852 vm_map_lock_read(map); 2853 break; 2854 default: 2855 return (EINVAL); 2856 } 2857 2858 /* 2859 * Locate starting entry and clip if necessary. 2860 */ 2861 VM_MAP_RANGE_CHECK(map, start, end); 2862 2863 if (modify_map) { 2864 /* 2865 * madvise behaviors that are implemented in the vm_map_entry. 2866 * 2867 * We clip the vm_map_entry so that behavioral changes are 2868 * limited to the specified address range. 2869 */ 2870 rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry); 2871 if (rv != KERN_SUCCESS) { 2872 vm_map_unlock(map); 2873 return (vm_mmap_to_errno(rv)); 2874 } 2875 2876 for (; entry->start < end; prev_entry = entry, 2877 entry = vm_map_entry_succ(entry)) { 2878 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) 2879 continue; 2880 2881 rv = vm_map_clip_end(map, entry, end); 2882 if (rv != KERN_SUCCESS) { 2883 vm_map_unlock(map); 2884 return (vm_mmap_to_errno(rv)); 2885 } 2886 2887 switch (behav) { 2888 case MADV_NORMAL: 2889 vm_map_entry_set_behavior(entry, 2890 MAP_ENTRY_BEHAV_NORMAL); 2891 break; 2892 case MADV_SEQUENTIAL: 2893 vm_map_entry_set_behavior(entry, 2894 MAP_ENTRY_BEHAV_SEQUENTIAL); 2895 break; 2896 case MADV_RANDOM: 2897 vm_map_entry_set_behavior(entry, 2898 MAP_ENTRY_BEHAV_RANDOM); 2899 break; 2900 case MADV_NOSYNC: 2901 entry->eflags |= MAP_ENTRY_NOSYNC; 2902 break; 2903 case MADV_AUTOSYNC: 2904 entry->eflags &= ~MAP_ENTRY_NOSYNC; 2905 break; 2906 case MADV_NOCORE: 2907 entry->eflags |= MAP_ENTRY_NOCOREDUMP; 2908 break; 2909 case MADV_CORE: 2910 entry->eflags &= ~MAP_ENTRY_NOCOREDUMP; 2911 break; 2912 default: 2913 break; 2914 } 2915 vm_map_try_merge_entries(map, prev_entry, entry); 2916 } 2917 vm_map_try_merge_entries(map, prev_entry, entry); 2918 vm_map_unlock(map); 2919 } else { 2920 vm_pindex_t pstart, pend; 2921 2922 /* 2923 * madvise behaviors that are implemented in the underlying 2924 * vm_object. 2925 * 2926 * Since we don't clip the vm_map_entry, we have to clip 2927 * the vm_object pindex and count. 2928 */ 2929 if (!vm_map_lookup_entry(map, start, &entry)) 2930 entry = vm_map_entry_succ(entry); 2931 for (; entry->start < end; 2932 entry = vm_map_entry_succ(entry)) { 2933 vm_offset_t useEnd, useStart; 2934 2935 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) 2936 continue; 2937 2938 /* 2939 * MADV_FREE would otherwise rewind time to 2940 * the creation of the shadow object. Because 2941 * we hold the VM map read-locked, neither the 2942 * entry's object nor the presence of a 2943 * backing object can change. 
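 * (Discarding pages from the shadow object would expose the older
 * copies kept in the backing object, effectively reverting the
 * visible contents of the mapping.)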
2944 */ 2945 if (behav == MADV_FREE && 2946 entry->object.vm_object != NULL && 2947 entry->object.vm_object->backing_object != NULL) 2948 continue; 2949 2950 pstart = OFF_TO_IDX(entry->offset); 2951 pend = pstart + atop(entry->end - entry->start); 2952 useStart = entry->start; 2953 useEnd = entry->end; 2954 2955 if (entry->start < start) { 2956 pstart += atop(start - entry->start); 2957 useStart = start; 2958 } 2959 if (entry->end > end) { 2960 pend -= atop(entry->end - end); 2961 useEnd = end; 2962 } 2963 2964 if (pstart >= pend) 2965 continue; 2966 2967 /* 2968 * Perform the pmap_advise() before clearing 2969 * PGA_REFERENCED in vm_page_advise(). Otherwise, a 2970 * concurrent pmap operation, such as pmap_remove(), 2971 * could clear a reference in the pmap and set 2972 * PGA_REFERENCED on the page before the pmap_advise() 2973 * had completed. Consequently, the page would appear 2974 * referenced based upon an old reference that 2975 * occurred before this pmap_advise() ran. 2976 */ 2977 if (behav == MADV_DONTNEED || behav == MADV_FREE) 2978 pmap_advise(map->pmap, useStart, useEnd, 2979 behav); 2980 2981 vm_object_madvise(entry->object.vm_object, pstart, 2982 pend, behav); 2983 2984 /* 2985 * Pre-populate paging structures in the 2986 * WILLNEED case. For wired entries, the 2987 * paging structures are already populated. 2988 */ 2989 if (behav == MADV_WILLNEED && 2990 entry->wired_count == 0) { 2991 vm_map_pmap_enter(map, 2992 useStart, 2993 entry->protection, 2994 entry->object.vm_object, 2995 pstart, 2996 ptoa(pend - pstart), 2997 MAP_PREFAULT_MADVISE 2998 ); 2999 } 3000 } 3001 vm_map_unlock_read(map); 3002 } 3003 return (0); 3004 } 3005 3006 /* 3007 * vm_map_inherit: 3008 * 3009 * Sets the inheritance of the specified address 3010 * range in the target map. Inheritance 3011 * affects how the map will be shared with 3012 * child maps at the time of vmspace_fork. 
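 *
 * Accepted values are VM_INHERIT_NONE, VM_INHERIT_COPY,
 * VM_INHERIT_SHARE, and VM_INHERIT_ZERO; anything else is rejected
 * below with KERN_INVALID_ARGUMENT.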
3013 */ 3014 int 3015 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, 3016 vm_inherit_t new_inheritance) 3017 { 3018 vm_map_entry_t entry, lentry, prev_entry, start_entry; 3019 int rv; 3020 3021 switch (new_inheritance) { 3022 case VM_INHERIT_NONE: 3023 case VM_INHERIT_COPY: 3024 case VM_INHERIT_SHARE: 3025 case VM_INHERIT_ZERO: 3026 break; 3027 default: 3028 return (KERN_INVALID_ARGUMENT); 3029 } 3030 if (start == end) 3031 return (KERN_SUCCESS); 3032 vm_map_lock(map); 3033 VM_MAP_RANGE_CHECK(map, start, end); 3034 rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry); 3035 if (rv != KERN_SUCCESS) 3036 goto unlock; 3037 if (vm_map_lookup_entry(map, end - 1, &lentry)) { 3038 rv = vm_map_clip_end(map, lentry, end); 3039 if (rv != KERN_SUCCESS) 3040 goto unlock; 3041 } 3042 if (new_inheritance == VM_INHERIT_COPY) { 3043 for (entry = start_entry; entry->start < end; 3044 prev_entry = entry, entry = vm_map_entry_succ(entry)) { 3045 if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) 3046 != 0) { 3047 rv = KERN_INVALID_ARGUMENT; 3048 goto unlock; 3049 } 3050 } 3051 } 3052 for (entry = start_entry; entry->start < end; prev_entry = entry, 3053 entry = vm_map_entry_succ(entry)) { 3054 KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx", 3055 entry, (uintmax_t)entry->end, (uintmax_t)end)); 3056 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || 3057 new_inheritance != VM_INHERIT_ZERO) 3058 entry->inheritance = new_inheritance; 3059 vm_map_try_merge_entries(map, prev_entry, entry); 3060 } 3061 vm_map_try_merge_entries(map, prev_entry, entry); 3062 unlock: 3063 vm_map_unlock(map); 3064 return (rv); 3065 } 3066 3067 /* 3068 * vm_map_entry_in_transition: 3069 * 3070 * Release the map lock, and sleep until the entry is no longer in 3071 * transition. Awake and acquire the map lock. If the map changed while 3072 * another held the lock, lookup a possibly-changed entry at or after the 3073 * 'start' position of the old entry. 3074 */ 3075 static vm_map_entry_t 3076 vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start, 3077 vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry) 3078 { 3079 vm_map_entry_t entry; 3080 vm_offset_t start; 3081 u_int last_timestamp; 3082 3083 VM_MAP_ASSERT_LOCKED(map); 3084 KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, 3085 ("not in-tranition map entry %p", in_entry)); 3086 /* 3087 * We have not yet clipped the entry. 3088 */ 3089 start = MAX(in_start, in_entry->start); 3090 in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 3091 last_timestamp = map->timestamp; 3092 if (vm_map_unlock_and_wait(map, 0)) { 3093 /* 3094 * Allow interruption of user wiring/unwiring? 3095 */ 3096 } 3097 vm_map_lock(map); 3098 if (last_timestamp + 1 == map->timestamp) 3099 return (in_entry); 3100 3101 /* 3102 * Look again for the entry because the map was modified while it was 3103 * unlocked. Specifically, the entry may have been clipped, merged, or 3104 * deleted. 3105 */ 3106 if (!vm_map_lookup_entry(map, start, &entry)) { 3107 if (!holes_ok) { 3108 *io_end = start; 3109 return (NULL); 3110 } 3111 entry = vm_map_entry_succ(entry); 3112 } 3113 return (entry); 3114 } 3115 3116 /* 3117 * vm_map_unwire: 3118 * 3119 * Implements both kernel and user unwiring. 
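 * The VM_MAP_WIRE_* flags select the behaviour: VM_MAP_WIRE_USER
 * clears MAP_ENTRY_USER_WIRED in addition to dropping the wiring
 * reference, while VM_MAP_WIRE_HOLESOK tolerates unmapped gaps in
 * the range.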
3120 */ 3121 int 3122 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, 3123 int flags) 3124 { 3125 vm_map_entry_t entry, first_entry, next_entry, prev_entry; 3126 int rv; 3127 bool holes_ok, need_wakeup, user_unwire; 3128 3129 if (start == end) 3130 return (KERN_SUCCESS); 3131 holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0; 3132 user_unwire = (flags & VM_MAP_WIRE_USER) != 0; 3133 vm_map_lock(map); 3134 VM_MAP_RANGE_CHECK(map, start, end); 3135 if (!vm_map_lookup_entry(map, start, &first_entry)) { 3136 if (holes_ok) 3137 first_entry = vm_map_entry_succ(first_entry); 3138 else { 3139 vm_map_unlock(map); 3140 return (KERN_INVALID_ADDRESS); 3141 } 3142 } 3143 rv = KERN_SUCCESS; 3144 for (entry = first_entry; entry->start < end; entry = next_entry) { 3145 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 3146 /* 3147 * We have not yet clipped the entry. 3148 */ 3149 next_entry = vm_map_entry_in_transition(map, start, 3150 &end, holes_ok, entry); 3151 if (next_entry == NULL) { 3152 if (entry == first_entry) { 3153 vm_map_unlock(map); 3154 return (KERN_INVALID_ADDRESS); 3155 } 3156 rv = KERN_INVALID_ADDRESS; 3157 break; 3158 } 3159 first_entry = (entry == first_entry) ? 3160 next_entry : NULL; 3161 continue; 3162 } 3163 rv = vm_map_clip_start(map, entry, start); 3164 if (rv != KERN_SUCCESS) 3165 break; 3166 rv = vm_map_clip_end(map, entry, end); 3167 if (rv != KERN_SUCCESS) 3168 break; 3169 3170 /* 3171 * Mark the entry in case the map lock is released. (See 3172 * above.) 3173 */ 3174 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && 3175 entry->wiring_thread == NULL, 3176 ("owned map entry %p", entry)); 3177 entry->eflags |= MAP_ENTRY_IN_TRANSITION; 3178 entry->wiring_thread = curthread; 3179 next_entry = vm_map_entry_succ(entry); 3180 /* 3181 * Check the map for holes in the specified region. 3182 * If holes_ok, skip this check. 3183 */ 3184 if (!holes_ok && 3185 entry->end < end && next_entry->start > entry->end) { 3186 end = entry->end; 3187 rv = KERN_INVALID_ADDRESS; 3188 break; 3189 } 3190 /* 3191 * If system unwiring, require that the entry is system wired. 3192 */ 3193 if (!user_unwire && 3194 vm_map_entry_system_wired_count(entry) == 0) { 3195 end = entry->end; 3196 rv = KERN_INVALID_ARGUMENT; 3197 break; 3198 } 3199 } 3200 need_wakeup = false; 3201 if (first_entry == NULL && 3202 !vm_map_lookup_entry(map, start, &first_entry)) { 3203 KASSERT(holes_ok, ("vm_map_unwire: lookup failed")); 3204 prev_entry = first_entry; 3205 entry = vm_map_entry_succ(first_entry); 3206 } else { 3207 prev_entry = vm_map_entry_pred(first_entry); 3208 entry = first_entry; 3209 } 3210 for (; entry->start < end; 3211 prev_entry = entry, entry = vm_map_entry_succ(entry)) { 3212 /* 3213 * If holes_ok was specified, an empty 3214 * space in the unwired region could have been mapped 3215 * while the map lock was dropped for draining 3216 * MAP_ENTRY_IN_TRANSITION. Moreover, another thread 3217 * could be simultaneously wiring this new mapping 3218 * entry. Detect these cases and skip any entries 3219 * marked as in transition by us. 
3220 */ 3221 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || 3222 entry->wiring_thread != curthread) { 3223 KASSERT(holes_ok, 3224 ("vm_map_unwire: !HOLESOK and new/changed entry")); 3225 continue; 3226 } 3227 3228 if (rv == KERN_SUCCESS && (!user_unwire || 3229 (entry->eflags & MAP_ENTRY_USER_WIRED))) { 3230 if (entry->wired_count == 1) 3231 vm_map_entry_unwire(map, entry); 3232 else 3233 entry->wired_count--; 3234 if (user_unwire) 3235 entry->eflags &= ~MAP_ENTRY_USER_WIRED; 3236 } 3237 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, 3238 ("vm_map_unwire: in-transition flag missing %p", entry)); 3239 KASSERT(entry->wiring_thread == curthread, 3240 ("vm_map_unwire: alien wire %p", entry)); 3241 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; 3242 entry->wiring_thread = NULL; 3243 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 3244 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; 3245 need_wakeup = true; 3246 } 3247 vm_map_try_merge_entries(map, prev_entry, entry); 3248 } 3249 vm_map_try_merge_entries(map, prev_entry, entry); 3250 vm_map_unlock(map); 3251 if (need_wakeup) 3252 vm_map_wakeup(map); 3253 return (rv); 3254 } 3255 3256 static void 3257 vm_map_wire_user_count_sub(u_long npages) 3258 { 3259 3260 atomic_subtract_long(&vm_user_wire_count, npages); 3261 } 3262 3263 static bool 3264 vm_map_wire_user_count_add(u_long npages) 3265 { 3266 u_long wired; 3267 3268 wired = vm_user_wire_count; 3269 do { 3270 if (npages + wired > vm_page_max_user_wired) 3271 return (false); 3272 } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired, 3273 npages + wired)); 3274 3275 return (true); 3276 } 3277 3278 /* 3279 * vm_map_wire_entry_failure: 3280 * 3281 * Handle a wiring failure on the given entry. 3282 * 3283 * The map should be locked. 3284 */ 3285 static void 3286 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, 3287 vm_offset_t failed_addr) 3288 { 3289 3290 VM_MAP_ASSERT_LOCKED(map); 3291 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 && 3292 entry->wired_count == 1, 3293 ("vm_map_wire_entry_failure: entry %p isn't being wired", entry)); 3294 KASSERT(failed_addr < entry->end, 3295 ("vm_map_wire_entry_failure: entry %p was fully wired", entry)); 3296 3297 /* 3298 * If any pages at the start of this entry were successfully wired, 3299 * then unwire them. 3300 */ 3301 if (failed_addr > entry->start) { 3302 pmap_unwire(map->pmap, entry->start, failed_addr); 3303 vm_object_unwire(entry->object.vm_object, entry->offset, 3304 failed_addr - entry->start, PQ_ACTIVE); 3305 } 3306 3307 /* 3308 * Assign an out-of-range value to represent the failure to wire this 3309 * entry. 3310 */ 3311 entry->wired_count = -1; 3312 } 3313 3314 int 3315 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) 3316 { 3317 int rv; 3318 3319 vm_map_lock(map); 3320 rv = vm_map_wire_locked(map, start, end, flags); 3321 vm_map_unlock(map); 3322 return (rv); 3323 } 3324 3325 /* 3326 * vm_map_wire_locked: 3327 * 3328 * Implements both kernel and user wiring. Returns with the map locked, 3329 * the map lock may be dropped. 
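 * The lock may be released and re-acquired internally while pages are
 * faulted in; the map is marked busy for that window and is locked
 * again on return, but callers must allow for the map having changed
 * in the meantime.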
3330 */ 3331 int 3332 vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) 3333 { 3334 vm_map_entry_t entry, first_entry, next_entry, prev_entry; 3335 vm_offset_t faddr, saved_end, saved_start; 3336 u_long incr, npages; 3337 u_int bidx, last_timestamp; 3338 int rv; 3339 bool holes_ok, need_wakeup, user_wire; 3340 vm_prot_t prot; 3341 3342 VM_MAP_ASSERT_LOCKED(map); 3343 3344 if (start == end) 3345 return (KERN_SUCCESS); 3346 prot = 0; 3347 if (flags & VM_MAP_WIRE_WRITE) 3348 prot |= VM_PROT_WRITE; 3349 holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0; 3350 user_wire = (flags & VM_MAP_WIRE_USER) != 0; 3351 VM_MAP_RANGE_CHECK(map, start, end); 3352 if (!vm_map_lookup_entry(map, start, &first_entry)) { 3353 if (holes_ok) 3354 first_entry = vm_map_entry_succ(first_entry); 3355 else 3356 return (KERN_INVALID_ADDRESS); 3357 } 3358 for (entry = first_entry; entry->start < end; entry = next_entry) { 3359 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 3360 /* 3361 * We have not yet clipped the entry. 3362 */ 3363 next_entry = vm_map_entry_in_transition(map, start, 3364 &end, holes_ok, entry); 3365 if (next_entry == NULL) { 3366 if (entry == first_entry) 3367 return (KERN_INVALID_ADDRESS); 3368 rv = KERN_INVALID_ADDRESS; 3369 goto done; 3370 } 3371 first_entry = (entry == first_entry) ? 3372 next_entry : NULL; 3373 continue; 3374 } 3375 rv = vm_map_clip_start(map, entry, start); 3376 if (rv != KERN_SUCCESS) 3377 goto done; 3378 rv = vm_map_clip_end(map, entry, end); 3379 if (rv != KERN_SUCCESS) 3380 goto done; 3381 3382 /* 3383 * Mark the entry in case the map lock is released. (See 3384 * above.) 3385 */ 3386 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && 3387 entry->wiring_thread == NULL, 3388 ("owned map entry %p", entry)); 3389 entry->eflags |= MAP_ENTRY_IN_TRANSITION; 3390 entry->wiring_thread = curthread; 3391 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 3392 || (entry->protection & prot) != prot) { 3393 entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; 3394 if (!holes_ok) { 3395 end = entry->end; 3396 rv = KERN_INVALID_ADDRESS; 3397 goto done; 3398 } 3399 } else if (entry->wired_count == 0) { 3400 entry->wired_count++; 3401 3402 npages = atop(entry->end - entry->start); 3403 if (user_wire && !vm_map_wire_user_count_add(npages)) { 3404 vm_map_wire_entry_failure(map, entry, 3405 entry->start); 3406 end = entry->end; 3407 rv = KERN_RESOURCE_SHORTAGE; 3408 goto done; 3409 } 3410 3411 /* 3412 * Release the map lock, relying on the in-transition 3413 * mark. Mark the map busy for fork. 3414 */ 3415 saved_start = entry->start; 3416 saved_end = entry->end; 3417 last_timestamp = map->timestamp; 3418 bidx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) 3419 >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; 3420 incr = pagesizes[bidx]; 3421 vm_map_busy(map); 3422 vm_map_unlock(map); 3423 3424 for (faddr = saved_start; faddr < saved_end; 3425 faddr += incr) { 3426 /* 3427 * Simulate a fault to get the page and enter 3428 * it into the physical map. 3429 */ 3430 rv = vm_fault(map, faddr, VM_PROT_NONE, 3431 VM_FAULT_WIRE, NULL); 3432 if (rv != KERN_SUCCESS) 3433 break; 3434 } 3435 vm_map_lock(map); 3436 vm_map_unbusy(map); 3437 if (last_timestamp + 1 != map->timestamp) { 3438 /* 3439 * Look again for the entry because the map was 3440 * modified while it was unlocked. The entry 3441 * may have been clipped, but NOT merged or 3442 * deleted. 
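 * (Merging is prevented because MAP_ENTRY_IN_TRANSITION is part of
 * MAP_ENTRY_NOMERGE_MASK, and vm_map_delete() waits for in-transition
 * entries to settle.)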
3443 */ 3444 if (!vm_map_lookup_entry(map, saved_start, 3445 &next_entry)) 3446 KASSERT(false, 3447 ("vm_map_wire: lookup failed")); 3448 first_entry = (entry == first_entry) ? 3449 next_entry : NULL; 3450 for (entry = next_entry; entry->end < saved_end; 3451 entry = vm_map_entry_succ(entry)) { 3452 /* 3453 * In case of failure, handle entries 3454 * that were not fully wired here; 3455 * fully wired entries are handled 3456 * later. 3457 */ 3458 if (rv != KERN_SUCCESS && 3459 faddr < entry->end) 3460 vm_map_wire_entry_failure(map, 3461 entry, faddr); 3462 } 3463 } 3464 if (rv != KERN_SUCCESS) { 3465 vm_map_wire_entry_failure(map, entry, faddr); 3466 if (user_wire) 3467 vm_map_wire_user_count_sub(npages); 3468 end = entry->end; 3469 goto done; 3470 } 3471 } else if (!user_wire || 3472 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { 3473 entry->wired_count++; 3474 } 3475 /* 3476 * Check the map for holes in the specified region. 3477 * If holes_ok was specified, skip this check. 3478 */ 3479 next_entry = vm_map_entry_succ(entry); 3480 if (!holes_ok && 3481 entry->end < end && next_entry->start > entry->end) { 3482 end = entry->end; 3483 rv = KERN_INVALID_ADDRESS; 3484 goto done; 3485 } 3486 } 3487 rv = KERN_SUCCESS; 3488 done: 3489 need_wakeup = false; 3490 if (first_entry == NULL && 3491 !vm_map_lookup_entry(map, start, &first_entry)) { 3492 KASSERT(holes_ok, ("vm_map_wire: lookup failed")); 3493 prev_entry = first_entry; 3494 entry = vm_map_entry_succ(first_entry); 3495 } else { 3496 prev_entry = vm_map_entry_pred(first_entry); 3497 entry = first_entry; 3498 } 3499 for (; entry->start < end; 3500 prev_entry = entry, entry = vm_map_entry_succ(entry)) { 3501 /* 3502 * If holes_ok was specified, an empty 3503 * space in the unwired region could have been mapped 3504 * while the map lock was dropped for faulting in the 3505 * pages or draining MAP_ENTRY_IN_TRANSITION. 3506 * Moreover, another thread could be simultaneously 3507 * wiring this new mapping entry. Detect these cases 3508 * and skip any entries marked as in transition not by us. 3509 * 3510 * Another way to get an entry not marked with 3511 * MAP_ENTRY_IN_TRANSITION is after failed clipping, 3512 * which set rv to KERN_INVALID_ARGUMENT. 3513 */ 3514 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || 3515 entry->wiring_thread != curthread) { 3516 KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT, 3517 ("vm_map_wire: !HOLESOK and new/changed entry")); 3518 continue; 3519 } 3520 3521 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) { 3522 /* do nothing */ 3523 } else if (rv == KERN_SUCCESS) { 3524 if (user_wire) 3525 entry->eflags |= MAP_ENTRY_USER_WIRED; 3526 } else if (entry->wired_count == -1) { 3527 /* 3528 * Wiring failed on this entry. Thus, unwiring is 3529 * unnecessary. 3530 */ 3531 entry->wired_count = 0; 3532 } else if (!user_wire || 3533 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { 3534 /* 3535 * Undo the wiring. Wiring succeeded on this entry 3536 * but failed on a later entry. 
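 * The wiring reference taken earlier is dropped here; for a user
 * wiring, the global count of user-wired pages is released as well.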
3537 */ 3538 if (entry->wired_count == 1) { 3539 vm_map_entry_unwire(map, entry); 3540 if (user_wire) 3541 vm_map_wire_user_count_sub( 3542 atop(entry->end - entry->start)); 3543 } else 3544 entry->wired_count--; 3545 } 3546 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, 3547 ("vm_map_wire: in-transition flag missing %p", entry)); 3548 KASSERT(entry->wiring_thread == curthread, 3549 ("vm_map_wire: alien wire %p", entry)); 3550 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION | 3551 MAP_ENTRY_WIRE_SKIPPED); 3552 entry->wiring_thread = NULL; 3553 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 3554 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; 3555 need_wakeup = true; 3556 } 3557 vm_map_try_merge_entries(map, prev_entry, entry); 3558 } 3559 vm_map_try_merge_entries(map, prev_entry, entry); 3560 if (need_wakeup) 3561 vm_map_wakeup(map); 3562 return (rv); 3563 } 3564 3565 /* 3566 * vm_map_sync 3567 * 3568 * Push any dirty cached pages in the address range to their pager. 3569 * If syncio is TRUE, dirty pages are written synchronously. 3570 * If invalidate is TRUE, any cached pages are freed as well. 3571 * 3572 * If the size of the region from start to end is zero, we are 3573 * supposed to flush all modified pages within the region containing 3574 * start. Unfortunately, a region can be split or coalesced with 3575 * neighboring regions, making it difficult to determine what the 3576 * original region was. Therefore, we approximate this requirement by 3577 * flushing the current region containing start. 3578 * 3579 * Returns an error if any part of the specified range is not mapped. 3580 */ 3581 int 3582 vm_map_sync( 3583 vm_map_t map, 3584 vm_offset_t start, 3585 vm_offset_t end, 3586 boolean_t syncio, 3587 boolean_t invalidate) 3588 { 3589 vm_map_entry_t entry, first_entry, next_entry; 3590 vm_size_t size; 3591 vm_object_t object; 3592 vm_ooffset_t offset; 3593 unsigned int last_timestamp; 3594 int bdry_idx; 3595 boolean_t failed; 3596 3597 vm_map_lock_read(map); 3598 VM_MAP_RANGE_CHECK(map, start, end); 3599 if (!vm_map_lookup_entry(map, start, &first_entry)) { 3600 vm_map_unlock_read(map); 3601 return (KERN_INVALID_ADDRESS); 3602 } else if (start == end) { 3603 start = first_entry->start; 3604 end = first_entry->end; 3605 } 3606 3607 /* 3608 * Make a first pass to check for user-wired memory, holes, 3609 * and partial invalidation of largepage mappings. 3610 */ 3611 for (entry = first_entry; entry->start < end; entry = next_entry) { 3612 if (invalidate) { 3613 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) { 3614 vm_map_unlock_read(map); 3615 return (KERN_INVALID_ARGUMENT); 3616 } 3617 bdry_idx = (entry->eflags & 3618 MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> 3619 MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; 3620 if (bdry_idx != 0 && 3621 ((start & (pagesizes[bdry_idx] - 1)) != 0 || 3622 (end & (pagesizes[bdry_idx] - 1)) != 0)) { 3623 vm_map_unlock_read(map); 3624 return (KERN_INVALID_ARGUMENT); 3625 } 3626 } 3627 next_entry = vm_map_entry_succ(entry); 3628 if (end > entry->end && 3629 entry->end != next_entry->start) { 3630 vm_map_unlock_read(map); 3631 return (KERN_INVALID_ADDRESS); 3632 } 3633 } 3634 3635 if (invalidate) 3636 pmap_remove(map->pmap, start, end); 3637 failed = FALSE; 3638 3639 /* 3640 * Make a second pass, cleaning/uncaching pages from the indicated 3641 * objects as we go. 3642 */ 3643 for (entry = first_entry; entry->start < end;) { 3644 offset = entry->offset + (start - entry->start); 3645 size = (end <= entry->end ? 
end : entry->end) - start; 3646 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { 3647 vm_map_t smap; 3648 vm_map_entry_t tentry; 3649 vm_size_t tsize; 3650 3651 smap = entry->object.sub_map; 3652 vm_map_lock_read(smap); 3653 (void) vm_map_lookup_entry(smap, offset, &tentry); 3654 tsize = tentry->end - offset; 3655 if (tsize < size) 3656 size = tsize; 3657 object = tentry->object.vm_object; 3658 offset = tentry->offset + (offset - tentry->start); 3659 vm_map_unlock_read(smap); 3660 } else { 3661 object = entry->object.vm_object; 3662 } 3663 vm_object_reference(object); 3664 last_timestamp = map->timestamp; 3665 vm_map_unlock_read(map); 3666 if (!vm_object_sync(object, offset, size, syncio, invalidate)) 3667 failed = TRUE; 3668 start += size; 3669 vm_object_deallocate(object); 3670 vm_map_lock_read(map); 3671 if (last_timestamp == map->timestamp || 3672 !vm_map_lookup_entry(map, start, &entry)) 3673 entry = vm_map_entry_succ(entry); 3674 } 3675 3676 vm_map_unlock_read(map); 3677 return (failed ? KERN_FAILURE : KERN_SUCCESS); 3678 } 3679 3680 /* 3681 * vm_map_entry_unwire: [ internal use only ] 3682 * 3683 * Make the region specified by this entry pageable. 3684 * 3685 * The map in question should be locked. 3686 * [This is the reason for this routine's existence.] 3687 */ 3688 static void 3689 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) 3690 { 3691 vm_size_t size; 3692 3693 VM_MAP_ASSERT_LOCKED(map); 3694 KASSERT(entry->wired_count > 0, 3695 ("vm_map_entry_unwire: entry %p isn't wired", entry)); 3696 3697 size = entry->end - entry->start; 3698 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) 3699 vm_map_wire_user_count_sub(atop(size)); 3700 pmap_unwire(map->pmap, entry->start, entry->end); 3701 vm_object_unwire(entry->object.vm_object, entry->offset, size, 3702 PQ_ACTIVE); 3703 entry->wired_count = 0; 3704 } 3705 3706 static void 3707 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map) 3708 { 3709 3710 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) 3711 vm_object_deallocate(entry->object.vm_object); 3712 uma_zfree(system_map ? kmapentzone : mapentzone, entry); 3713 } 3714 3715 /* 3716 * vm_map_entry_delete: [ internal use only ] 3717 * 3718 * Deallocate the given entry from the target map. 
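 *
 * The entry's swap charge is released immediately; on system maps the
 * entry itself is freed at once, while on user maps it is queued on
 * the current thread's deferred list and freed later.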
3719 */ 3720 static void 3721 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) 3722 { 3723 vm_object_t object; 3724 vm_pindex_t offidxstart, offidxend, size1; 3725 vm_size_t size; 3726 3727 vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE); 3728 object = entry->object.vm_object; 3729 3730 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) { 3731 MPASS(entry->cred == NULL); 3732 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0); 3733 MPASS(object == NULL); 3734 vm_map_entry_deallocate(entry, map->system_map); 3735 return; 3736 } 3737 3738 size = entry->end - entry->start; 3739 map->size -= size; 3740 3741 if (entry->cred != NULL) { 3742 swap_release_by_cred(size, entry->cred); 3743 crfree(entry->cred); 3744 } 3745 3746 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) { 3747 entry->object.vm_object = NULL; 3748 } else if ((object->flags & OBJ_ANON) != 0 || 3749 object == kernel_object) { 3750 KASSERT(entry->cred == NULL || object->cred == NULL || 3751 (entry->eflags & MAP_ENTRY_NEEDS_COPY), 3752 ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); 3753 offidxstart = OFF_TO_IDX(entry->offset); 3754 offidxend = offidxstart + atop(size); 3755 VM_OBJECT_WLOCK(object); 3756 if (object->ref_count != 1 && 3757 ((object->flags & OBJ_ONEMAPPING) != 0 || 3758 object == kernel_object)) { 3759 vm_object_collapse(object); 3760 3761 /* 3762 * The option OBJPR_NOTMAPPED can be passed here 3763 * because vm_map_delete() already performed 3764 * pmap_remove() on the only mapping to this range 3765 * of pages. 3766 */ 3767 vm_object_page_remove(object, offidxstart, offidxend, 3768 OBJPR_NOTMAPPED); 3769 if (offidxend >= object->size && 3770 offidxstart < object->size) { 3771 size1 = object->size; 3772 object->size = offidxstart; 3773 if (object->cred != NULL) { 3774 size1 -= object->size; 3775 KASSERT(object->charge >= ptoa(size1), 3776 ("object %p charge < 0", object)); 3777 swap_release_by_cred(ptoa(size1), 3778 object->cred); 3779 object->charge -= ptoa(size1); 3780 } 3781 } 3782 } 3783 VM_OBJECT_WUNLOCK(object); 3784 } 3785 if (map->system_map) 3786 vm_map_entry_deallocate(entry, TRUE); 3787 else { 3788 entry->defer_next = curthread->td_map_def_user; 3789 curthread->td_map_def_user = entry; 3790 } 3791 } 3792 3793 /* 3794 * vm_map_delete: [ internal use only ] 3795 * 3796 * Deallocates the given address range from the target 3797 * map. 3798 */ 3799 int 3800 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) 3801 { 3802 vm_map_entry_t entry, next_entry, scratch_entry; 3803 int rv; 3804 3805 VM_MAP_ASSERT_LOCKED(map); 3806 3807 if (start == end) 3808 return (KERN_SUCCESS); 3809 3810 /* 3811 * Find the start of the region, and clip it. 3812 * Step through all entries in this region. 3813 */ 3814 rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry); 3815 if (rv != KERN_SUCCESS) 3816 return (rv); 3817 for (; entry->start < end; entry = next_entry) { 3818 /* 3819 * Wait for wiring or unwiring of an entry to complete. 3820 * Also wait for any system wirings to disappear on 3821 * user maps. 
3822 */ 3823 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || 3824 (vm_map_pmap(map) != kernel_pmap && 3825 vm_map_entry_system_wired_count(entry) != 0)) { 3826 unsigned int last_timestamp; 3827 vm_offset_t saved_start; 3828 3829 saved_start = entry->start; 3830 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 3831 last_timestamp = map->timestamp; 3832 (void) vm_map_unlock_and_wait(map, 0); 3833 vm_map_lock(map); 3834 if (last_timestamp + 1 != map->timestamp) { 3835 /* 3836 * Look again for the entry because the map was 3837 * modified while it was unlocked. 3838 * Specifically, the entry may have been 3839 * clipped, merged, or deleted. 3840 */ 3841 rv = vm_map_lookup_clip_start(map, saved_start, 3842 &next_entry, &scratch_entry); 3843 if (rv != KERN_SUCCESS) 3844 break; 3845 } else 3846 next_entry = entry; 3847 continue; 3848 } 3849 3850 /* XXXKIB or delete to the upper superpage boundary ? */ 3851 rv = vm_map_clip_end(map, entry, end); 3852 if (rv != KERN_SUCCESS) 3853 break; 3854 next_entry = vm_map_entry_succ(entry); 3855 3856 /* 3857 * Unwire before removing addresses from the pmap; otherwise, 3858 * unwiring will put the entries back in the pmap. 3859 */ 3860 if (entry->wired_count != 0) 3861 vm_map_entry_unwire(map, entry); 3862 3863 /* 3864 * Remove mappings for the pages, but only if the 3865 * mappings could exist. For instance, it does not 3866 * make sense to call pmap_remove() for guard entries. 3867 */ 3868 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || 3869 entry->object.vm_object != NULL) 3870 pmap_remove(map->pmap, entry->start, entry->end); 3871 3872 if (entry->end == map->anon_loc) 3873 map->anon_loc = entry->start; 3874 3875 /* 3876 * Delete the entry only after removing all pmap 3877 * entries pointing to its pages. (Otherwise, its 3878 * page frames may be reallocated, and any modify bits 3879 * will be set in the wrong object!) 3880 */ 3881 vm_map_entry_delete(map, entry); 3882 } 3883 return (rv); 3884 } 3885 3886 /* 3887 * vm_map_remove: 3888 * 3889 * Remove the given address range from the target map. 3890 * This is the exported form of vm_map_delete. 3891 */ 3892 int 3893 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) 3894 { 3895 int result; 3896 3897 vm_map_lock(map); 3898 VM_MAP_RANGE_CHECK(map, start, end); 3899 result = vm_map_delete(map, start, end); 3900 vm_map_unlock(map); 3901 return (result); 3902 } 3903 3904 /* 3905 * vm_map_check_protection: 3906 * 3907 * Assert that the target map allows the specified privilege on the 3908 * entire address region given. The entire region must be allocated. 3909 * 3910 * WARNING! This code does not and should not check whether the 3911 * contents of the region is accessible. For example a smaller file 3912 * might be mapped into a larger address space. 3913 * 3914 * NOTE! This code is also called by munmap(). 3915 * 3916 * The map must be locked. A read lock is sufficient. 3917 */ 3918 boolean_t 3919 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, 3920 vm_prot_t protection) 3921 { 3922 vm_map_entry_t entry; 3923 vm_map_entry_t tmp_entry; 3924 3925 if (!vm_map_lookup_entry(map, start, &tmp_entry)) 3926 return (FALSE); 3927 entry = tmp_entry; 3928 3929 while (start < end) { 3930 /* 3931 * No holes allowed! 3932 */ 3933 if (start < entry->start) 3934 return (FALSE); 3935 /* 3936 * Check protection associated with entry. 
3937 */ 3938 if ((entry->protection & protection) != protection) 3939 return (FALSE); 3940 /* go to next entry */ 3941 start = entry->end; 3942 entry = vm_map_entry_succ(entry); 3943 } 3944 return (TRUE); 3945 } 3946 3947 /* 3948 * 3949 * vm_map_copy_swap_object: 3950 * 3951 * Copies a swap-backed object from an existing map entry to a 3952 * new one. Carries forward the swap charge. May change the 3953 * src object on return. 3954 */ 3955 static void 3956 vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry, 3957 vm_offset_t size, vm_ooffset_t *fork_charge) 3958 { 3959 vm_object_t src_object; 3960 struct ucred *cred; 3961 int charged; 3962 3963 src_object = src_entry->object.vm_object; 3964 charged = ENTRY_CHARGED(src_entry); 3965 if ((src_object->flags & OBJ_ANON) != 0) { 3966 VM_OBJECT_WLOCK(src_object); 3967 vm_object_collapse(src_object); 3968 if ((src_object->flags & OBJ_ONEMAPPING) != 0) { 3969 vm_object_split(src_entry); 3970 src_object = src_entry->object.vm_object; 3971 } 3972 vm_object_reference_locked(src_object); 3973 vm_object_clear_flag(src_object, OBJ_ONEMAPPING); 3974 VM_OBJECT_WUNLOCK(src_object); 3975 } else 3976 vm_object_reference(src_object); 3977 if (src_entry->cred != NULL && 3978 !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { 3979 KASSERT(src_object->cred == NULL, 3980 ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p", 3981 src_object)); 3982 src_object->cred = src_entry->cred; 3983 src_object->charge = size; 3984 } 3985 dst_entry->object.vm_object = src_object; 3986 if (charged) { 3987 cred = curthread->td_ucred; 3988 crhold(cred); 3989 dst_entry->cred = cred; 3990 *fork_charge += size; 3991 if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { 3992 crhold(cred); 3993 src_entry->cred = cred; 3994 *fork_charge += size; 3995 } 3996 } 3997 } 3998 3999 /* 4000 * vm_map_copy_entry: 4001 * 4002 * Copies the contents of the source entry to the destination 4003 * entry. The entries *must* be aligned properly. 4004 */ 4005 static void 4006 vm_map_copy_entry( 4007 vm_map_t src_map, 4008 vm_map_t dst_map, 4009 vm_map_entry_t src_entry, 4010 vm_map_entry_t dst_entry, 4011 vm_ooffset_t *fork_charge) 4012 { 4013 vm_object_t src_object; 4014 vm_map_entry_t fake_entry; 4015 vm_offset_t size; 4016 4017 VM_MAP_ASSERT_LOCKED(dst_map); 4018 4019 if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) 4020 return; 4021 4022 if (src_entry->wired_count == 0 || 4023 (src_entry->protection & VM_PROT_WRITE) == 0) { 4024 /* 4025 * If the source entry is marked needs_copy, it is already 4026 * write-protected. 4027 */ 4028 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && 4029 (src_entry->protection & VM_PROT_WRITE) != 0) { 4030 pmap_protect(src_map->pmap, 4031 src_entry->start, 4032 src_entry->end, 4033 src_entry->protection & ~VM_PROT_WRITE); 4034 } 4035 4036 /* 4037 * Make a copy of the object. 4038 */ 4039 size = src_entry->end - src_entry->start; 4040 if ((src_object = src_entry->object.vm_object) != NULL) { 4041 if (src_object->type == OBJT_DEFAULT || 4042 src_object->type == OBJT_SWAP) { 4043 vm_map_copy_swap_object(src_entry, dst_entry, 4044 size, fork_charge); 4045 /* May have split/collapsed, reload obj. 
*/ 4046 src_object = src_entry->object.vm_object; 4047 } else { 4048 vm_object_reference(src_object); 4049 dst_entry->object.vm_object = src_object; 4050 } 4051 src_entry->eflags |= MAP_ENTRY_COW | 4052 MAP_ENTRY_NEEDS_COPY; 4053 dst_entry->eflags |= MAP_ENTRY_COW | 4054 MAP_ENTRY_NEEDS_COPY; 4055 dst_entry->offset = src_entry->offset; 4056 if (src_entry->eflags & MAP_ENTRY_WRITECNT) { 4057 /* 4058 * MAP_ENTRY_WRITECNT cannot 4059 * indicate write reference from 4060 * src_entry, since the entry is 4061 * marked as needs copy. Allocate a 4062 * fake entry that is used to 4063 * decrement object->un_pager writecount 4064 * at the appropriate time. Attach 4065 * fake_entry to the deferred list. 4066 */ 4067 fake_entry = vm_map_entry_create(dst_map); 4068 fake_entry->eflags = MAP_ENTRY_WRITECNT; 4069 src_entry->eflags &= ~MAP_ENTRY_WRITECNT; 4070 vm_object_reference(src_object); 4071 fake_entry->object.vm_object = src_object; 4072 fake_entry->start = src_entry->start; 4073 fake_entry->end = src_entry->end; 4074 fake_entry->defer_next = 4075 curthread->td_map_def_user; 4076 curthread->td_map_def_user = fake_entry; 4077 } 4078 4079 pmap_copy(dst_map->pmap, src_map->pmap, 4080 dst_entry->start, dst_entry->end - dst_entry->start, 4081 src_entry->start); 4082 } else { 4083 dst_entry->object.vm_object = NULL; 4084 dst_entry->offset = 0; 4085 if (src_entry->cred != NULL) { 4086 dst_entry->cred = curthread->td_ucred; 4087 crhold(dst_entry->cred); 4088 *fork_charge += size; 4089 } 4090 } 4091 } else { 4092 /* 4093 * We don't want to make writeable wired pages copy-on-write. 4094 * Immediately copy these pages into the new map by simulating 4095 * page faults. The new pages are pageable. 4096 */ 4097 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, 4098 fork_charge); 4099 } 4100 } 4101 4102 /* 4103 * vmspace_map_entry_forked: 4104 * Update the newly-forked vmspace each time a map entry is inherited 4105 * or copied. The values for vm_dsize and vm_tsize are approximate 4106 * (and mostly-obsolete ideas in the face of mmap(2) et al.) 4107 */ 4108 static void 4109 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, 4110 vm_map_entry_t entry) 4111 { 4112 vm_size_t entrysize; 4113 vm_offset_t newend; 4114 4115 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) 4116 return; 4117 entrysize = entry->end - entry->start; 4118 vm2->vm_map.size += entrysize; 4119 if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { 4120 vm2->vm_ssize += btoc(entrysize); 4121 } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && 4122 entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { 4123 newend = MIN(entry->end, 4124 (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); 4125 vm2->vm_dsize += btoc(newend - entry->start); 4126 } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && 4127 entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { 4128 newend = MIN(entry->end, 4129 (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); 4130 vm2->vm_tsize += btoc(newend - entry->start); 4131 } 4132 } 4133 4134 /* 4135 * vmspace_fork: 4136 * Create a new process vmspace structure and vm_map 4137 * based on those of an existing process. The new map 4138 * is based on the old map, according to the inheritance 4139 * values on the regions in that map. 4140 * 4141 * XXX It might be worth coalescing the entries added to the new vmspace. 4142 * 4143 * The source map must not be locked. 
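 *
 *	Entries are handled according to their inheritance:
 *	VM_INHERIT_NONE entries are skipped, VM_INHERIT_SHARE entries
 *	reference the same backing object in both maps,
 *	VM_INHERIT_COPY entries are marked copy-on-write, and
 *	VM_INHERIT_ZERO entries become fresh anonymous mappings.
 *	*fork_charge accumulates the amount of swap that the caller
 *	must reserve on behalf of the new vmspace (see, e.g.,
 *	vmspace_unshare() below).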
4144 */ 4145 struct vmspace * 4146 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) 4147 { 4148 struct vmspace *vm2; 4149 vm_map_t new_map, old_map; 4150 vm_map_entry_t new_entry, old_entry; 4151 vm_object_t object; 4152 int error, locked; 4153 vm_inherit_t inh; 4154 4155 old_map = &vm1->vm_map; 4156 /* Copy immutable fields of vm1 to vm2. */ 4157 vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map), 4158 pmap_pinit); 4159 if (vm2 == NULL) 4160 return (NULL); 4161 4162 vm2->vm_taddr = vm1->vm_taddr; 4163 vm2->vm_daddr = vm1->vm_daddr; 4164 vm2->vm_maxsaddr = vm1->vm_maxsaddr; 4165 vm_map_lock(old_map); 4166 if (old_map->busy) 4167 vm_map_wait_busy(old_map); 4168 new_map = &vm2->vm_map; 4169 locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ 4170 KASSERT(locked, ("vmspace_fork: lock failed")); 4171 4172 error = pmap_vmspace_copy(new_map->pmap, old_map->pmap); 4173 if (error != 0) { 4174 sx_xunlock(&old_map->lock); 4175 sx_xunlock(&new_map->lock); 4176 vm_map_process_deferred(); 4177 vmspace_free(vm2); 4178 return (NULL); 4179 } 4180 4181 new_map->anon_loc = old_map->anon_loc; 4182 new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART); 4183 4184 VM_MAP_ENTRY_FOREACH(old_entry, old_map) { 4185 if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) 4186 panic("vm_map_fork: encountered a submap"); 4187 4188 inh = old_entry->inheritance; 4189 if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 && 4190 inh != VM_INHERIT_NONE) 4191 inh = VM_INHERIT_COPY; 4192 4193 switch (inh) { 4194 case VM_INHERIT_NONE: 4195 break; 4196 4197 case VM_INHERIT_SHARE: 4198 /* 4199 * Clone the entry, creating the shared object if 4200 * necessary. 4201 */ 4202 object = old_entry->object.vm_object; 4203 if (object == NULL) { 4204 vm_map_entry_back(old_entry); 4205 object = old_entry->object.vm_object; 4206 } 4207 4208 /* 4209 * Add the reference before calling vm_object_shadow 4210 * to insure that a shadow object is created. 4211 */ 4212 vm_object_reference(object); 4213 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { 4214 vm_object_shadow(&old_entry->object.vm_object, 4215 &old_entry->offset, 4216 old_entry->end - old_entry->start, 4217 old_entry->cred, 4218 /* Transfer the second reference too. */ 4219 true); 4220 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; 4221 old_entry->cred = NULL; 4222 4223 /* 4224 * As in vm_map_merged_neighbor_dispose(), 4225 * the vnode lock will not be acquired in 4226 * this call to vm_object_deallocate(). 4227 */ 4228 vm_object_deallocate(object); 4229 object = old_entry->object.vm_object; 4230 } else { 4231 VM_OBJECT_WLOCK(object); 4232 vm_object_clear_flag(object, OBJ_ONEMAPPING); 4233 if (old_entry->cred != NULL) { 4234 KASSERT(object->cred == NULL, 4235 ("vmspace_fork both cred")); 4236 object->cred = old_entry->cred; 4237 object->charge = old_entry->end - 4238 old_entry->start; 4239 old_entry->cred = NULL; 4240 } 4241 4242 /* 4243 * Assert the correct state of the vnode 4244 * v_writecount while the object is locked, to 4245 * not relock it later for the assertion 4246 * correctness. 4247 */ 4248 if (old_entry->eflags & MAP_ENTRY_WRITECNT && 4249 object->type == OBJT_VNODE) { 4250 KASSERT(((struct vnode *)object-> 4251 handle)->v_writecount > 0, 4252 ("vmspace_fork: v_writecount %p", 4253 object)); 4254 KASSERT(object->un_pager.vnp. 4255 writemappings > 0, 4256 ("vmspace_fork: vnp.writecount %p", 4257 object)); 4258 } 4259 VM_OBJECT_WUNLOCK(object); 4260 } 4261 4262 /* 4263 * Clone the entry, referencing the shared object. 
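 *	The child's copy starts with no user wiring and is not in
 *	transition; pmap_copy() below may prime the new pmap with the
 *	parent's existing mappings for this range.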
4264 */ 4265 new_entry = vm_map_entry_create(new_map); 4266 *new_entry = *old_entry; 4267 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | 4268 MAP_ENTRY_IN_TRANSITION); 4269 new_entry->wiring_thread = NULL; 4270 new_entry->wired_count = 0; 4271 if (new_entry->eflags & MAP_ENTRY_WRITECNT) { 4272 vm_pager_update_writecount(object, 4273 new_entry->start, new_entry->end); 4274 } 4275 vm_map_entry_set_vnode_text(new_entry, true); 4276 4277 /* 4278 * Insert the entry into the new map -- we know we're 4279 * inserting at the end of the new map. 4280 */ 4281 vm_map_entry_link(new_map, new_entry); 4282 vmspace_map_entry_forked(vm1, vm2, new_entry); 4283 4284 /* 4285 * Update the physical map 4286 */ 4287 pmap_copy(new_map->pmap, old_map->pmap, 4288 new_entry->start, 4289 (old_entry->end - old_entry->start), 4290 old_entry->start); 4291 break; 4292 4293 case VM_INHERIT_COPY: 4294 /* 4295 * Clone the entry and link into the map. 4296 */ 4297 new_entry = vm_map_entry_create(new_map); 4298 *new_entry = *old_entry; 4299 /* 4300 * Copied entry is COW over the old object. 4301 */ 4302 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | 4303 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT); 4304 new_entry->wiring_thread = NULL; 4305 new_entry->wired_count = 0; 4306 new_entry->object.vm_object = NULL; 4307 new_entry->cred = NULL; 4308 vm_map_entry_link(new_map, new_entry); 4309 vmspace_map_entry_forked(vm1, vm2, new_entry); 4310 vm_map_copy_entry(old_map, new_map, old_entry, 4311 new_entry, fork_charge); 4312 vm_map_entry_set_vnode_text(new_entry, true); 4313 break; 4314 4315 case VM_INHERIT_ZERO: 4316 /* 4317 * Create a new anonymous mapping entry modelled from 4318 * the old one. 4319 */ 4320 new_entry = vm_map_entry_create(new_map); 4321 memset(new_entry, 0, sizeof(*new_entry)); 4322 4323 new_entry->start = old_entry->start; 4324 new_entry->end = old_entry->end; 4325 new_entry->eflags = old_entry->eflags & 4326 ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | 4327 MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC | 4328 MAP_ENTRY_SPLIT_BOUNDARY_MASK); 4329 new_entry->protection = old_entry->protection; 4330 new_entry->max_protection = old_entry->max_protection; 4331 new_entry->inheritance = VM_INHERIT_ZERO; 4332 4333 vm_map_entry_link(new_map, new_entry); 4334 vmspace_map_entry_forked(vm1, vm2, new_entry); 4335 4336 new_entry->cred = curthread->td_ucred; 4337 crhold(new_entry->cred); 4338 *fork_charge += (new_entry->end - new_entry->start); 4339 4340 break; 4341 } 4342 } 4343 /* 4344 * Use inlined vm_map_unlock() to postpone handling the deferred 4345 * map entries, which cannot be done until both old_map and 4346 * new_map locks are released. 4347 */ 4348 sx_xunlock(&old_map->lock); 4349 sx_xunlock(&new_map->lock); 4350 vm_map_process_deferred(); 4351 4352 return (vm2); 4353 } 4354 4355 /* 4356 * Create a process's stack for exec_new_vmspace(). This function is never 4357 * asked to wire the newly created stack. 4358 */ 4359 int 4360 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, 4361 vm_prot_t prot, vm_prot_t max, int cow) 4362 { 4363 vm_size_t growsize, init_ssize; 4364 rlim_t vmemlim; 4365 int rv; 4366 4367 MPASS((map->flags & MAP_WIREFUTURE) == 0); 4368 growsize = sgrowsiz; 4369 init_ssize = (max_ssize < growsize) ? 
max_ssize : growsize; 4370 vm_map_lock(map); 4371 vmemlim = lim_cur(curthread, RLIMIT_VMEM); 4372 /* If we would blow our VMEM resource limit, no go */ 4373 if (map->size + init_ssize > vmemlim) { 4374 rv = KERN_NO_SPACE; 4375 goto out; 4376 } 4377 rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot, 4378 max, cow); 4379 out: 4380 vm_map_unlock(map); 4381 return (rv); 4382 } 4383 4384 static int stack_guard_page = 1; 4385 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN, 4386 &stack_guard_page, 0, 4387 "Specifies the number of guard pages for a stack that grows"); 4388 4389 static int 4390 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, 4391 vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) 4392 { 4393 vm_map_entry_t new_entry, prev_entry; 4394 vm_offset_t bot, gap_bot, gap_top, top; 4395 vm_size_t init_ssize, sgp; 4396 int orient, rv; 4397 4398 /* 4399 * The stack orientation is piggybacked with the cow argument. 4400 * Extract it into orient and mask the cow argument so that we 4401 * don't pass it around further. 4402 */ 4403 orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP); 4404 KASSERT(orient != 0, ("No stack grow direction")); 4405 KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP), 4406 ("bi-dir stack")); 4407 4408 if (max_ssize == 0 || 4409 !vm_map_range_valid(map, addrbos, addrbos + max_ssize)) 4410 return (KERN_INVALID_ADDRESS); 4411 sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 || 4412 (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 : 4413 (vm_size_t)stack_guard_page * PAGE_SIZE; 4414 if (sgp >= max_ssize) 4415 return (KERN_INVALID_ARGUMENT); 4416 4417 init_ssize = growsize; 4418 if (max_ssize < init_ssize + sgp) 4419 init_ssize = max_ssize - sgp; 4420 4421 /* If addr is already mapped, no go */ 4422 if (vm_map_lookup_entry(map, addrbos, &prev_entry)) 4423 return (KERN_NO_SPACE); 4424 4425 /* 4426 * If we can't accommodate max_ssize in the current mapping, no go. 4427 */ 4428 if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize) 4429 return (KERN_NO_SPACE); 4430 4431 /* 4432 * We initially map a stack of only init_ssize. We will grow as 4433 * needed later. Depending on the orientation of the stack (i.e. 4434 * the grow direction) we either map at the top of the range, the 4435 * bottom of the range or in the middle. 4436 * 4437 * Note: we would normally expect prot and max to be VM_PROT_ALL, 4438 * and cow to be 0. Possibly we should eliminate these as input 4439 * parameters, and just pass these values here in the insert call. 
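 *
 *	A sketch of the resulting layout for MAP_STACK_GROWS_DOWN
 *	(the values are computed just below):
 *
 *	    addrbos == gap_bot      gap_top == bot      top == addrbos + max_ssize
 *	    |<-------- stack gap -------->|<-------- init_ssize -------->|
 *
 *	For MAP_STACK_GROWS_UP the stack entry sits at addrbos and the
 *	gap covers [top, addrbos + max_ssize).  The gap is inserted as
 *	a guard entry; vm_map_growstack() later consumes it as the
 *	stack grows, normally leaving at least sgp bytes unmapped as a
 *	guard.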
4440 */ 4441 if (orient == MAP_STACK_GROWS_DOWN) { 4442 bot = addrbos + max_ssize - init_ssize; 4443 top = bot + init_ssize; 4444 gap_bot = addrbos; 4445 gap_top = bot; 4446 } else /* if (orient == MAP_STACK_GROWS_UP) */ { 4447 bot = addrbos; 4448 top = bot + init_ssize; 4449 gap_bot = top; 4450 gap_top = addrbos + max_ssize; 4451 } 4452 rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); 4453 if (rv != KERN_SUCCESS) 4454 return (rv); 4455 new_entry = vm_map_entry_succ(prev_entry); 4456 KASSERT(new_entry->end == top || new_entry->start == bot, 4457 ("Bad entry start/end for new stack entry")); 4458 KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || 4459 (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, 4460 ("new entry lacks MAP_ENTRY_GROWS_DOWN")); 4461 KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || 4462 (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, 4463 ("new entry lacks MAP_ENTRY_GROWS_UP")); 4464 if (gap_bot == gap_top) 4465 return (KERN_SUCCESS); 4466 rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE, 4467 VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ? 4468 MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP)); 4469 if (rv == KERN_SUCCESS) { 4470 /* 4471 * Gap can never successfully handle a fault, so 4472 * read-ahead logic is never used for it. Re-use 4473 * next_read of the gap entry to store 4474 * stack_guard_page for vm_map_growstack(). 4475 */ 4476 if (orient == MAP_STACK_GROWS_DOWN) 4477 vm_map_entry_pred(new_entry)->next_read = sgp; 4478 else 4479 vm_map_entry_succ(new_entry)->next_read = sgp; 4480 } else { 4481 (void)vm_map_delete(map, bot, top); 4482 } 4483 return (rv); 4484 } 4485 4486 /* 4487 * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we 4488 * successfully grow the stack. 4489 */ 4490 static int 4491 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry) 4492 { 4493 vm_map_entry_t stack_entry; 4494 struct proc *p; 4495 struct vmspace *vm; 4496 struct ucred *cred; 4497 vm_offset_t gap_end, gap_start, grow_start; 4498 vm_size_t grow_amount, guard, max_grow; 4499 rlim_t lmemlim, stacklim, vmemlim; 4500 int rv, rv1; 4501 bool gap_deleted, grow_down, is_procstack; 4502 #ifdef notyet 4503 uint64_t limit; 4504 #endif 4505 #ifdef RACCT 4506 int error; 4507 #endif 4508 4509 p = curproc; 4510 vm = p->p_vmspace; 4511 4512 /* 4513 * Disallow stack growth when the access is performed by a 4514 * debugger or AIO daemon. The reason is that the wrong 4515 * resource limits are applied. 4516 */ 4517 if (p != initproc && (map != &p->p_vmspace->vm_map || 4518 p->p_textvp == NULL)) 4519 return (KERN_FAILURE); 4520 4521 MPASS(!map->system_map); 4522 4523 lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); 4524 stacklim = lim_cur(curthread, RLIMIT_STACK); 4525 vmemlim = lim_cur(curthread, RLIMIT_VMEM); 4526 retry: 4527 /* If addr is not in a hole for a stack grow area, no need to grow. 
*/ 4528 if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry)) 4529 return (KERN_FAILURE); 4530 if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0) 4531 return (KERN_SUCCESS); 4532 if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) { 4533 stack_entry = vm_map_entry_succ(gap_entry); 4534 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 || 4535 stack_entry->start != gap_entry->end) 4536 return (KERN_FAILURE); 4537 grow_amount = round_page(stack_entry->start - addr); 4538 grow_down = true; 4539 } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) { 4540 stack_entry = vm_map_entry_pred(gap_entry); 4541 if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 || 4542 stack_entry->end != gap_entry->start) 4543 return (KERN_FAILURE); 4544 grow_amount = round_page(addr + 1 - stack_entry->end); 4545 grow_down = false; 4546 } else { 4547 return (KERN_FAILURE); 4548 } 4549 guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 || 4550 (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 : 4551 gap_entry->next_read; 4552 max_grow = gap_entry->end - gap_entry->start; 4553 if (guard > max_grow) 4554 return (KERN_NO_SPACE); 4555 max_grow -= guard; 4556 if (grow_amount > max_grow) 4557 return (KERN_NO_SPACE); 4558 4559 /* 4560 * If this is the main process stack, see if we're over the stack 4561 * limit. 4562 */ 4563 is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr && 4564 addr < (vm_offset_t)p->p_sysent->sv_usrstack; 4565 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) 4566 return (KERN_NO_SPACE); 4567 4568 #ifdef RACCT 4569 if (racct_enable) { 4570 PROC_LOCK(p); 4571 if (is_procstack && racct_set(p, RACCT_STACK, 4572 ctob(vm->vm_ssize) + grow_amount)) { 4573 PROC_UNLOCK(p); 4574 return (KERN_NO_SPACE); 4575 } 4576 PROC_UNLOCK(p); 4577 } 4578 #endif 4579 4580 grow_amount = roundup(grow_amount, sgrowsiz); 4581 if (grow_amount > max_grow) 4582 grow_amount = max_grow; 4583 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { 4584 grow_amount = trunc_page((vm_size_t)stacklim) - 4585 ctob(vm->vm_ssize); 4586 } 4587 4588 #ifdef notyet 4589 PROC_LOCK(p); 4590 limit = racct_get_available(p, RACCT_STACK); 4591 PROC_UNLOCK(p); 4592 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) 4593 grow_amount = limit - ctob(vm->vm_ssize); 4594 #endif 4595 4596 if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) { 4597 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { 4598 rv = KERN_NO_SPACE; 4599 goto out; 4600 } 4601 #ifdef RACCT 4602 if (racct_enable) { 4603 PROC_LOCK(p); 4604 if (racct_set(p, RACCT_MEMLOCK, 4605 ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { 4606 PROC_UNLOCK(p); 4607 rv = KERN_NO_SPACE; 4608 goto out; 4609 } 4610 PROC_UNLOCK(p); 4611 } 4612 #endif 4613 } 4614 4615 /* If we would blow our VMEM resource limit, no go */ 4616 if (map->size + grow_amount > vmemlim) { 4617 rv = KERN_NO_SPACE; 4618 goto out; 4619 } 4620 #ifdef RACCT 4621 if (racct_enable) { 4622 PROC_LOCK(p); 4623 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { 4624 PROC_UNLOCK(p); 4625 rv = KERN_NO_SPACE; 4626 goto out; 4627 } 4628 PROC_UNLOCK(p); 4629 } 4630 #endif 4631 4632 if (vm_map_lock_upgrade(map)) { 4633 gap_entry = NULL; 4634 vm_map_lock_read(map); 4635 goto retry; 4636 } 4637 4638 if (grow_down) { 4639 grow_start = gap_entry->end - grow_amount; 4640 if (gap_entry->start + grow_amount == gap_entry->end) { 4641 gap_start = gap_entry->start; 4642 gap_end = gap_entry->end; 4643 vm_map_entry_delete(map, gap_entry); 4644 
gap_deleted = true; 4645 } else { 4646 MPASS(gap_entry->start < gap_entry->end - grow_amount); 4647 vm_map_entry_resize(map, gap_entry, -grow_amount); 4648 gap_deleted = false; 4649 } 4650 rv = vm_map_insert(map, NULL, 0, grow_start, 4651 grow_start + grow_amount, 4652 stack_entry->protection, stack_entry->max_protection, 4653 MAP_STACK_GROWS_DOWN); 4654 if (rv != KERN_SUCCESS) { 4655 if (gap_deleted) { 4656 rv1 = vm_map_insert(map, NULL, 0, gap_start, 4657 gap_end, VM_PROT_NONE, VM_PROT_NONE, 4658 MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN); 4659 MPASS(rv1 == KERN_SUCCESS); 4660 } else 4661 vm_map_entry_resize(map, gap_entry, 4662 grow_amount); 4663 } 4664 } else { 4665 grow_start = stack_entry->end; 4666 cred = stack_entry->cred; 4667 if (cred == NULL && stack_entry->object.vm_object != NULL) 4668 cred = stack_entry->object.vm_object->cred; 4669 if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred)) 4670 rv = KERN_NO_SPACE; 4671 /* Grow the underlying object if applicable. */ 4672 else if (stack_entry->object.vm_object == NULL || 4673 vm_object_coalesce(stack_entry->object.vm_object, 4674 stack_entry->offset, 4675 (vm_size_t)(stack_entry->end - stack_entry->start), 4676 grow_amount, cred != NULL)) { 4677 if (gap_entry->start + grow_amount == gap_entry->end) { 4678 vm_map_entry_delete(map, gap_entry); 4679 vm_map_entry_resize(map, stack_entry, 4680 grow_amount); 4681 } else { 4682 gap_entry->start += grow_amount; 4683 stack_entry->end += grow_amount; 4684 } 4685 map->size += grow_amount; 4686 rv = KERN_SUCCESS; 4687 } else 4688 rv = KERN_FAILURE; 4689 } 4690 if (rv == KERN_SUCCESS && is_procstack) 4691 vm->vm_ssize += btoc(grow_amount); 4692 4693 /* 4694 * Heed the MAP_WIREFUTURE flag if it was set for this process. 4695 */ 4696 if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) { 4697 rv = vm_map_wire_locked(map, grow_start, 4698 grow_start + grow_amount, 4699 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 4700 } 4701 vm_map_lock_downgrade(map); 4702 4703 out: 4704 #ifdef RACCT 4705 if (racct_enable && rv != KERN_SUCCESS) { 4706 PROC_LOCK(p); 4707 error = racct_set(p, RACCT_VMEM, map->size); 4708 KASSERT(error == 0, ("decreasing RACCT_VMEM failed")); 4709 if (!old_mlock) { 4710 error = racct_set(p, RACCT_MEMLOCK, 4711 ptoa(pmap_wired_count(map->pmap))); 4712 KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed")); 4713 } 4714 error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize)); 4715 KASSERT(error == 0, ("decreasing RACCT_STACK failed")); 4716 PROC_UNLOCK(p); 4717 } 4718 #endif 4719 4720 return (rv); 4721 } 4722 4723 /* 4724 * Unshare the specified VM space for exec. If other processes are 4725 * mapped to it, then create a new one. The new vmspace is null. 4726 */ 4727 int 4728 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) 4729 { 4730 struct vmspace *oldvmspace = p->p_vmspace; 4731 struct vmspace *newvmspace; 4732 4733 KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0, 4734 ("vmspace_exec recursed")); 4735 newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit); 4736 if (newvmspace == NULL) 4737 return (ENOMEM); 4738 newvmspace->vm_swrss = oldvmspace->vm_swrss; 4739 /* 4740 * This code is written like this for prototype purposes. The 4741 * goal is to avoid running down the vmspace here, but let the 4742 * other process's that are still using the vmspace to finally 4743 * run it down. Even though there is little or no chance of blocking 4744 * here, it is a good idea to keep this form for future mods. 
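 *
 *	TDP_EXECVMSPC records that this thread still owes a reference
 *	on the old vmspace; the exec code is expected to drop that
 *	reference with vmspace_free() once the switch to the new
 *	vmspace is complete.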
4745 */ 4746 PROC_VMSPACE_LOCK(p); 4747 p->p_vmspace = newvmspace; 4748 PROC_VMSPACE_UNLOCK(p); 4749 if (p == curthread->td_proc) 4750 pmap_activate(curthread); 4751 curthread->td_pflags |= TDP_EXECVMSPC; 4752 return (0); 4753 } 4754 4755 /* 4756 * Unshare the specified VM space for forcing COW. This 4757 * is called by rfork, for the (RFMEM|RFPROC) == 0 case. 4758 */ 4759 int 4760 vmspace_unshare(struct proc *p) 4761 { 4762 struct vmspace *oldvmspace = p->p_vmspace; 4763 struct vmspace *newvmspace; 4764 vm_ooffset_t fork_charge; 4765 4766 if (refcount_load(&oldvmspace->vm_refcnt) == 1) 4767 return (0); 4768 fork_charge = 0; 4769 newvmspace = vmspace_fork(oldvmspace, &fork_charge); 4770 if (newvmspace == NULL) 4771 return (ENOMEM); 4772 if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) { 4773 vmspace_free(newvmspace); 4774 return (ENOMEM); 4775 } 4776 PROC_VMSPACE_LOCK(p); 4777 p->p_vmspace = newvmspace; 4778 PROC_VMSPACE_UNLOCK(p); 4779 if (p == curthread->td_proc) 4780 pmap_activate(curthread); 4781 vmspace_free(oldvmspace); 4782 return (0); 4783 } 4784 4785 /* 4786 * vm_map_lookup: 4787 * 4788 * Finds the VM object, offset, and 4789 * protection for a given virtual address in the 4790 * specified map, assuming a page fault of the 4791 * type specified. 4792 * 4793 * Leaves the map in question locked for read; return 4794 * values are guaranteed until a vm_map_lookup_done 4795 * call is performed. Note that the map argument 4796 * is in/out; the returned map must be used in 4797 * the call to vm_map_lookup_done. 4798 * 4799 * A handle (out_entry) is returned for use in 4800 * vm_map_lookup_done, to make that fast. 4801 * 4802 * If a lookup is requested with "write protection" 4803 * specified, the map may be changed to perform virtual 4804 * copying operations, although the data referenced will 4805 * remain the same. 4806 */ 4807 int 4808 vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ 4809 vm_offset_t vaddr, 4810 vm_prot_t fault_typea, 4811 vm_map_entry_t *out_entry, /* OUT */ 4812 vm_object_t *object, /* OUT */ 4813 vm_pindex_t *pindex, /* OUT */ 4814 vm_prot_t *out_prot, /* OUT */ 4815 boolean_t *wired) /* OUT */ 4816 { 4817 vm_map_entry_t entry; 4818 vm_map_t map = *var_map; 4819 vm_prot_t prot; 4820 vm_prot_t fault_type; 4821 vm_object_t eobject; 4822 vm_size_t size; 4823 struct ucred *cred; 4824 4825 RetryLookup: 4826 4827 vm_map_lock_read(map); 4828 4829 RetryLookupLocked: 4830 /* 4831 * Lookup the faulting address. 4832 */ 4833 if (!vm_map_lookup_entry(map, vaddr, out_entry)) { 4834 vm_map_unlock_read(map); 4835 return (KERN_INVALID_ADDRESS); 4836 } 4837 4838 entry = *out_entry; 4839 4840 /* 4841 * Handle submaps. 4842 */ 4843 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { 4844 vm_map_t old_map = map; 4845 4846 *var_map = map = entry->object.sub_map; 4847 vm_map_unlock_read(old_map); 4848 goto RetryLookup; 4849 } 4850 4851 /* 4852 * Check whether this task is allowed to have this page. 
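 *	VM_PROT_FAULT_LOOKUP, passed by the page-fault handler, also
 *	lets a fault that lands in a stack gap guard entry grow the
 *	stack via vm_map_growstack() and retry the lookup.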
4853 */ 4854 prot = entry->protection; 4855 if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) { 4856 fault_typea &= ~VM_PROT_FAULT_LOOKUP; 4857 if (prot == VM_PROT_NONE && map != kernel_map && 4858 (entry->eflags & MAP_ENTRY_GUARD) != 0 && 4859 (entry->eflags & (MAP_ENTRY_STACK_GAP_DN | 4860 MAP_ENTRY_STACK_GAP_UP)) != 0 && 4861 vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS) 4862 goto RetryLookupLocked; 4863 } 4864 fault_type = fault_typea & VM_PROT_ALL; 4865 if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) { 4866 vm_map_unlock_read(map); 4867 return (KERN_PROTECTION_FAILURE); 4868 } 4869 KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags & 4870 (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) != 4871 (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY), 4872 ("entry %p flags %x", entry, entry->eflags)); 4873 if ((fault_typea & VM_PROT_COPY) != 0 && 4874 (entry->max_protection & VM_PROT_WRITE) == 0 && 4875 (entry->eflags & MAP_ENTRY_COW) == 0) { 4876 vm_map_unlock_read(map); 4877 return (KERN_PROTECTION_FAILURE); 4878 } 4879 4880 /* 4881 * If this page is not pageable, we have to get it for all possible 4882 * accesses. 4883 */ 4884 *wired = (entry->wired_count != 0); 4885 if (*wired) 4886 fault_type = entry->protection; 4887 size = entry->end - entry->start; 4888 4889 /* 4890 * If the entry was copy-on-write, we either ... 4891 */ 4892 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { 4893 /* 4894 * If we want to write the page, we may as well handle that 4895 * now since we've got the map locked. 4896 * 4897 * If we don't need to write the page, we just demote the 4898 * permissions allowed. 4899 */ 4900 if ((fault_type & VM_PROT_WRITE) != 0 || 4901 (fault_typea & VM_PROT_COPY) != 0) { 4902 /* 4903 * Make a new object, and place it in the object 4904 * chain. Note that no new references have appeared 4905 * -- one just moved from the map to the new 4906 * object. 4907 */ 4908 if (vm_map_lock_upgrade(map)) 4909 goto RetryLookup; 4910 4911 if (entry->cred == NULL) { 4912 /* 4913 * The debugger owner is charged for 4914 * the memory. 4915 */ 4916 cred = curthread->td_ucred; 4917 crhold(cred); 4918 if (!swap_reserve_by_cred(size, cred)) { 4919 crfree(cred); 4920 vm_map_unlock(map); 4921 return (KERN_RESOURCE_SHORTAGE); 4922 } 4923 entry->cred = cred; 4924 } 4925 eobject = entry->object.vm_object; 4926 vm_object_shadow(&entry->object.vm_object, 4927 &entry->offset, size, entry->cred, false); 4928 if (eobject == entry->object.vm_object) { 4929 /* 4930 * The object was not shadowed. 4931 */ 4932 swap_release_by_cred(size, entry->cred); 4933 crfree(entry->cred); 4934 } 4935 entry->cred = NULL; 4936 entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; 4937 4938 vm_map_lock_downgrade(map); 4939 } else { 4940 /* 4941 * We're attempting to read a copy-on-write page -- 4942 * don't allow writes. 4943 */ 4944 prot &= ~VM_PROT_WRITE; 4945 } 4946 } 4947 4948 /* 4949 * Create an object if necessary. 4950 */ 4951 if (entry->object.vm_object == NULL && !map->system_map) { 4952 if (vm_map_lock_upgrade(map)) 4953 goto RetryLookup; 4954 entry->object.vm_object = vm_object_allocate_anon(atop(size), 4955 NULL, entry->cred, entry->cred != NULL ? size : 0); 4956 entry->offset = 0; 4957 entry->cred = NULL; 4958 vm_map_lock_downgrade(map); 4959 } 4960 4961 /* 4962 * Return the object/offset from this entry. If the entry was 4963 * copy-on-write or empty, it has been fixed up. 
4964 */ 4965 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); 4966 *object = entry->object.vm_object; 4967 4968 *out_prot = prot; 4969 return (KERN_SUCCESS); 4970 } 4971 4972 /* 4973 * vm_map_lookup_locked: 4974 * 4975 * Lookup the faulting address. A version of vm_map_lookup that returns 4976 * KERN_FAILURE instead of blocking on map lock or memory allocation. 4977 */ 4978 int 4979 vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ 4980 vm_offset_t vaddr, 4981 vm_prot_t fault_typea, 4982 vm_map_entry_t *out_entry, /* OUT */ 4983 vm_object_t *object, /* OUT */ 4984 vm_pindex_t *pindex, /* OUT */ 4985 vm_prot_t *out_prot, /* OUT */ 4986 boolean_t *wired) /* OUT */ 4987 { 4988 vm_map_entry_t entry; 4989 vm_map_t map = *var_map; 4990 vm_prot_t prot; 4991 vm_prot_t fault_type = fault_typea; 4992 4993 /* 4994 * Lookup the faulting address. 4995 */ 4996 if (!vm_map_lookup_entry(map, vaddr, out_entry)) 4997 return (KERN_INVALID_ADDRESS); 4998 4999 entry = *out_entry; 5000 5001 /* 5002 * Fail if the entry refers to a submap. 5003 */ 5004 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) 5005 return (KERN_FAILURE); 5006 5007 /* 5008 * Check whether this task is allowed to have this page. 5009 */ 5010 prot = entry->protection; 5011 fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; 5012 if ((fault_type & prot) != fault_type) 5013 return (KERN_PROTECTION_FAILURE); 5014 5015 /* 5016 * If this page is not pageable, we have to get it for all possible 5017 * accesses. 5018 */ 5019 *wired = (entry->wired_count != 0); 5020 if (*wired) 5021 fault_type = entry->protection; 5022 5023 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { 5024 /* 5025 * Fail if the entry was copy-on-write for a write fault. 5026 */ 5027 if (fault_type & VM_PROT_WRITE) 5028 return (KERN_FAILURE); 5029 /* 5030 * We're attempting to read a copy-on-write page -- 5031 * don't allow writes. 5032 */ 5033 prot &= ~VM_PROT_WRITE; 5034 } 5035 5036 /* 5037 * Fail if an object should be created. 5038 */ 5039 if (entry->object.vm_object == NULL && !map->system_map) 5040 return (KERN_FAILURE); 5041 5042 /* 5043 * Return the object/offset from this entry. If the entry was 5044 * copy-on-write or empty, it has been fixed up. 5045 */ 5046 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); 5047 *object = entry->object.vm_object; 5048 5049 *out_prot = prot; 5050 return (KERN_SUCCESS); 5051 } 5052 5053 /* 5054 * vm_map_lookup_done: 5055 * 5056 * Releases locks acquired by a vm_map_lookup 5057 * (according to the handle returned by that lookup). 
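 *
 *	A minimal sketch of the calling convention (hypothetical
 *	caller; "vaddr" is the address of interest, and the real
 *	consumer, the page-fault handler, adds retry and error
 *	handling):
 *
 *		vm_map_t map = &curproc->p_vmspace->vm_map;
 *		vm_map_entry_t entry;
 *		vm_object_t object;
 *		vm_pindex_t pindex;
 *		vm_prot_t prot;
 *		boolean_t wired;
 *
 *		if (vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry,
 *		    &object, &pindex, &prot, &wired) == KERN_SUCCESS) {
 *			... use object and pindex while the map stays
 *			    read-locked ...
 *			vm_map_lookup_done(map, entry);
 *		}
 *
 *	Note that the lookup may have replaced "map" with a submap,
 *	which is why the same (possibly updated) map pointer must be
 *	passed back here.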
5058 */ 5059 void 5060 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) 5061 { 5062 /* 5063 * Unlock the main-level map 5064 */ 5065 vm_map_unlock_read(map); 5066 } 5067 5068 vm_offset_t 5069 vm_map_max_KBI(const struct vm_map *map) 5070 { 5071 5072 return (vm_map_max(map)); 5073 } 5074 5075 vm_offset_t 5076 vm_map_min_KBI(const struct vm_map *map) 5077 { 5078 5079 return (vm_map_min(map)); 5080 } 5081 5082 pmap_t 5083 vm_map_pmap_KBI(vm_map_t map) 5084 { 5085 5086 return (map->pmap); 5087 } 5088 5089 bool 5090 vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end) 5091 { 5092 5093 return (vm_map_range_valid(map, start, end)); 5094 } 5095 5096 #ifdef INVARIANTS 5097 static void 5098 _vm_map_assert_consistent(vm_map_t map, int check) 5099 { 5100 vm_map_entry_t entry, prev; 5101 vm_map_entry_t cur, header, lbound, ubound; 5102 vm_size_t max_left, max_right; 5103 5104 #ifdef DIAGNOSTIC 5105 ++map->nupdates; 5106 #endif 5107 if (enable_vmmap_check != check) 5108 return; 5109 5110 header = prev = &map->header; 5111 VM_MAP_ENTRY_FOREACH(entry, map) { 5112 KASSERT(prev->end <= entry->start, 5113 ("map %p prev->end = %jx, start = %jx", map, 5114 (uintmax_t)prev->end, (uintmax_t)entry->start)); 5115 KASSERT(entry->start < entry->end, 5116 ("map %p start = %jx, end = %jx", map, 5117 (uintmax_t)entry->start, (uintmax_t)entry->end)); 5118 KASSERT(entry->left == header || 5119 entry->left->start < entry->start, 5120 ("map %p left->start = %jx, start = %jx", map, 5121 (uintmax_t)entry->left->start, (uintmax_t)entry->start)); 5122 KASSERT(entry->right == header || 5123 entry->start < entry->right->start, 5124 ("map %p start = %jx, right->start = %jx", map, 5125 (uintmax_t)entry->start, (uintmax_t)entry->right->start)); 5126 cur = map->root; 5127 lbound = ubound = header; 5128 for (;;) { 5129 if (entry->start < cur->start) { 5130 ubound = cur; 5131 cur = cur->left; 5132 KASSERT(cur != lbound, 5133 ("map %p cannot find %jx", 5134 map, (uintmax_t)entry->start)); 5135 } else if (cur->end <= entry->start) { 5136 lbound = cur; 5137 cur = cur->right; 5138 KASSERT(cur != ubound, 5139 ("map %p cannot find %jx", 5140 map, (uintmax_t)entry->start)); 5141 } else { 5142 KASSERT(cur == entry, 5143 ("map %p cannot find %jx", 5144 map, (uintmax_t)entry->start)); 5145 break; 5146 } 5147 } 5148 max_left = vm_map_entry_max_free_left(entry, lbound); 5149 max_right = vm_map_entry_max_free_right(entry, ubound); 5150 KASSERT(entry->max_free == vm_size_max(max_left, max_right), 5151 ("map %p max = %jx, max_left = %jx, max_right = %jx", map, 5152 (uintmax_t)entry->max_free, 5153 (uintmax_t)max_left, (uintmax_t)max_right)); 5154 prev = entry; 5155 } 5156 KASSERT(prev->end <= entry->start, 5157 ("map %p prev->end = %jx, start = %jx", map, 5158 (uintmax_t)prev->end, (uintmax_t)entry->start)); 5159 } 5160 #endif 5161 5162 #include "opt_ddb.h" 5163 #ifdef DDB 5164 #include <sys/kernel.h> 5165 5166 #include <ddb/ddb.h> 5167 5168 static void 5169 vm_map_print(vm_map_t map) 5170 { 5171 vm_map_entry_t entry, prev; 5172 5173 db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", 5174 (void *)map, 5175 (void *)map->pmap, map->nentries, map->timestamp); 5176 5177 db_indent += 2; 5178 prev = &map->header; 5179 VM_MAP_ENTRY_FOREACH(entry, map) { 5180 db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n", 5181 (void *)entry, (void *)entry->start, (void *)entry->end, 5182 entry->eflags); 5183 { 5184 static const char * const inheritance_name[4] = 5185 {"share", "copy", "none", "donate_copy"}; 5186 5187 
db_iprintf(" prot=%x/%x/%s", 5188 entry->protection, 5189 entry->max_protection, 5190 inheritance_name[(int)(unsigned char) 5191 entry->inheritance]); 5192 if (entry->wired_count != 0) 5193 db_printf(", wired"); 5194 } 5195 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { 5196 db_printf(", share=%p, offset=0x%jx\n", 5197 (void *)entry->object.sub_map, 5198 (uintmax_t)entry->offset); 5199 if (prev == &map->header || 5200 prev->object.sub_map != 5201 entry->object.sub_map) { 5202 db_indent += 2; 5203 vm_map_print((vm_map_t)entry->object.sub_map); 5204 db_indent -= 2; 5205 } 5206 } else { 5207 if (entry->cred != NULL) 5208 db_printf(", ruid %d", entry->cred->cr_ruid); 5209 db_printf(", object=%p, offset=0x%jx", 5210 (void *)entry->object.vm_object, 5211 (uintmax_t)entry->offset); 5212 if (entry->object.vm_object && entry->object.vm_object->cred) 5213 db_printf(", obj ruid %d charge %jx", 5214 entry->object.vm_object->cred->cr_ruid, 5215 (uintmax_t)entry->object.vm_object->charge); 5216 if (entry->eflags & MAP_ENTRY_COW) 5217 db_printf(", copy (%s)", 5218 (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); 5219 db_printf("\n"); 5220 5221 if (prev == &map->header || 5222 prev->object.vm_object != 5223 entry->object.vm_object) { 5224 db_indent += 2; 5225 vm_object_print((db_expr_t)(intptr_t) 5226 entry->object.vm_object, 5227 0, 0, (char *)0); 5228 db_indent -= 2; 5229 } 5230 } 5231 prev = entry; 5232 } 5233 db_indent -= 2; 5234 } 5235 5236 DB_SHOW_COMMAND(map, map) 5237 { 5238 5239 if (!have_addr) { 5240 db_printf("usage: show map <addr>\n"); 5241 return; 5242 } 5243 vm_map_print((vm_map_t)addr); 5244 } 5245 5246 DB_SHOW_COMMAND(procvm, procvm) 5247 { 5248 struct proc *p; 5249 5250 if (have_addr) { 5251 p = db_lookup_proc(addr); 5252 } else { 5253 p = curproc; 5254 } 5255 5256 db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", 5257 (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, 5258 (void *)vmspace_pmap(p->p_vmspace)); 5259 5260 vm_map_print((vm_map_t)&p->p_vmspace->vm_map); 5261 } 5262 5263 #endif /* DDB */ 5264