xref: /freebsd/sys/vm/vm_map.c (revision 97cb52fa9aefd90fad38790fded50905aeeb9b9e)
1 /*-
2  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3  *
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * The Mach Operating System project at Carnegie-Mellon University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
35  *
36  *
37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38  * All rights reserved.
39  *
40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41  *
42  * Permission to use, copy, modify and distribute this software and
43  * its documentation is hereby granted, provided that both the copyright
44  * notice and this permission notice appear in all copies of the
45  * software, derivative works or modified versions, and any portions
46  * thereof, and that both notices appear in supporting documentation.
47  *
48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51  *
52  * Carnegie Mellon requests users of this software to return to
53  *
54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55  *  School of Computer Science
56  *  Carnegie Mellon University
57  *  Pittsburgh PA 15213-3890
58  *
59  * any improvements or extensions that they make and grant Carnegie the
60  * rights to redistribute these changes.
61  */
62 
63 /*
64  *	Virtual memory mapping module.
65  */
66 
67 #include <sys/cdefs.h>
68 __FBSDID("$FreeBSD$");
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/lock.h>
75 #include <sys/mutex.h>
76 #include <sys/proc.h>
77 #include <sys/vmmeter.h>
78 #include <sys/mman.h>
79 #include <sys/vnode.h>
80 #include <sys/racct.h>
81 #include <sys/resourcevar.h>
82 #include <sys/rwlock.h>
83 #include <sys/file.h>
84 #include <sys/sysctl.h>
85 #include <sys/sysent.h>
86 #include <sys/shm.h>
87 
88 #include <vm/vm.h>
89 #include <vm/vm_param.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_pager.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_extern.h>
97 #include <vm/vnode_pager.h>
98 #include <vm/swap_pager.h>
99 #include <vm/uma.h>
100 
101 /*
102  *	Virtual memory maps provide for the mapping, protection,
103  *	and sharing of virtual memory objects.  In addition,
104  *	this module provides for an efficient virtual copy of
105  *	memory from one map to another.
106  *
107  *	Synchronization is required prior to most operations.
108  *
109  *	Maps consist of an ordered doubly-linked list of simple
110  *	entries; a self-adjusting binary search tree of these
111  *	entries is used to speed up lookups.
112  *
113  *	Since portions of maps are specified by start/end addresses,
114  *	which may not align with existing map entries, all
115  *	routines merely "clip" entries to these start/end values.
116  *	[That is, an entry is split into two, bordering at a
117  *	start or end value.]  Note that these clippings may not
118  *	always be necessary (as the two resulting entries are then
119  *	not changed); however, the clipping is done for convenience.
120  *
121  *	As mentioned above, virtual copy operations are performed
122  *	by copying VM object references from one map to
123  *	another, and then marking both regions as copy-on-write.
124  */
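
/*
 * Illustrative sketch (not part of the module and never compiled): the
 * ordered entry list and the entry count are kept in sync, so a caller
 * holding the map lock could cross-check them as follows.
 */
#if 0
static void
example_check_entry_list(vm_map_t map)
{
	vm_map_entry_t entry;
	int n;

	n = 0;
	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next)
		n++;			/* entries are sorted by address */
	KASSERT(n == map->nentries, ("entry list and nentries disagree"));
}
#endif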
125 
126 static struct mtx map_sleep_mtx;
127 static uma_zone_t mapentzone;
128 static uma_zone_t kmapentzone;
129 static uma_zone_t mapzone;
130 static uma_zone_t vmspace_zone;
131 static int vmspace_zinit(void *mem, int size, int flags);
132 static int vm_map_zinit(void *mem, int size, int flags);
133 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
134     vm_offset_t max);
135 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
136 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
137 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
138 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
139     vm_map_entry_t gap_entry);
140 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
141     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
142 #ifdef INVARIANTS
143 static void vm_map_zdtor(void *mem, int size, void *arg);
144 static void vmspace_zdtor(void *mem, int size, void *arg);
145 #endif
146 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
147     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
148     int cow);
149 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
150     vm_offset_t failed_addr);
151 
152 #define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
153     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
154      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
155 
156 /*
157  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
158  * stable.
159  */
160 #define PROC_VMSPACE_LOCK(p) do { } while (0)
161 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
162 
163 /*
164  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
165  *
166  *	Asserts that the starting and ending region
167  *	addresses fall within the valid range of the map.
168  */
169 #define	VM_MAP_RANGE_CHECK(map, start, end)		\
170 		{					\
171 		if (start < vm_map_min(map))		\
172 			start = vm_map_min(map);	\
173 		if (end > vm_map_max(map))		\
174 			end = vm_map_max(map);		\
175 		if (start > end)			\
176 			start = end;			\
177 		}
178 
179 /*
180  *	vm_map_startup:
181  *
182  *	Initialize the vm_map module.  Must be called before
183  *	any other vm_map routines.
184  *
185  *	Map and entry structures are allocated from the general
186  *	purpose memory pool with some exceptions:
187  *
188  *	- The kernel map and kmem submap are allocated statically.
189  *	- Kernel map entries are allocated out of a static pool.
190  *
191  *	These restrictions are necessary since malloc() uses the
192  *	maps and requires map entries.
193  */
194 
195 void
196 vm_map_startup(void)
197 {
198 	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
199 	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
200 #ifdef INVARIANTS
201 	    vm_map_zdtor,
202 #else
203 	    NULL,
204 #endif
205 	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
206 	uma_prealloc(mapzone, MAX_KMAP);
207 	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
208 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
209 	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
210 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
211 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
212 	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
213 #ifdef INVARIANTS
214 	    vmspace_zdtor,
215 #else
216 	    NULL,
217 #endif
218 	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
219 }
220 
221 static int
222 vmspace_zinit(void *mem, int size, int flags)
223 {
224 	struct vmspace *vm;
225 
226 	vm = (struct vmspace *)mem;
227 
228 	vm->vm_map.pmap = NULL;
229 	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
230 	PMAP_LOCK_INIT(vmspace_pmap(vm));
231 	return (0);
232 }
233 
234 static int
235 vm_map_zinit(void *mem, int size, int flags)
236 {
237 	vm_map_t map;
238 
239 	map = (vm_map_t)mem;
240 	memset(map, 0, sizeof(*map));
241 	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
242 	sx_init(&map->lock, "vm map (user)");
243 	return (0);
244 }
245 
246 #ifdef INVARIANTS
247 static void
248 vmspace_zdtor(void *mem, int size, void *arg)
249 {
250 	struct vmspace *vm;
251 
252 	vm = (struct vmspace *)mem;
253 
254 	vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
255 }
256 static void
257 vm_map_zdtor(void *mem, int size, void *arg)
258 {
259 	vm_map_t map;
260 
261 	map = (vm_map_t)mem;
262 	KASSERT(map->nentries == 0,
263 	    ("map %p nentries == %d on free.",
264 	    map, map->nentries));
265 	KASSERT(map->size == 0,
266 	    ("map %p size == %lu on free.",
267 	    map, (unsigned long)map->size));
268 }
269 #endif	/* INVARIANTS */
270 
271 /*
272  * Allocate a vmspace structure, including a vm_map and pmap,
273  * and initialize those structures.  The refcnt is set to 1.
274  *
275  * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
276  */
277 struct vmspace *
278 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
279 {
280 	struct vmspace *vm;
281 
282 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
283 
284 	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
285 
286 	if (pinit == NULL)
287 		pinit = &pmap_pinit;
288 
289 	if (!pinit(vmspace_pmap(vm))) {
290 		uma_zfree(vmspace_zone, vm);
291 		return (NULL);
292 	}
293 	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
294 	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
295 	vm->vm_refcnt = 1;
296 	vm->vm_shm = NULL;
297 	vm->vm_swrss = 0;
298 	vm->vm_tsize = 0;
299 	vm->vm_dsize = 0;
300 	vm->vm_ssize = 0;
301 	vm->vm_taddr = 0;
302 	vm->vm_daddr = 0;
303 	vm->vm_maxsaddr = 0;
304 	return (vm);
305 }
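
/*
 * Hedged usage sketch (never compiled; the address bounds are illustrative
 * placeholders): allocate a fresh vmspace with the default pmap
 * initializer and drop the single reference again.
 */
#if 0
static void
example_vmspace_roundtrip(void)
{
	struct vmspace *vm;

	vm = vmspace_alloc(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, NULL);
	if (vm == NULL)
		return;			/* pmap_pinit() failed */
	/* ... install or otherwise use the vmspace ... */
	vmspace_free(vm);		/* drops the refcnt set to 1 above */
}
#endif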
306 
307 #ifdef RACCT
308 static void
309 vmspace_container_reset(struct proc *p)
310 {
311 
312 	PROC_LOCK(p);
313 	racct_set(p, RACCT_DATA, 0);
314 	racct_set(p, RACCT_STACK, 0);
315 	racct_set(p, RACCT_RSS, 0);
316 	racct_set(p, RACCT_MEMLOCK, 0);
317 	racct_set(p, RACCT_VMEM, 0);
318 	PROC_UNLOCK(p);
319 }
320 #endif
321 
322 static inline void
323 vmspace_dofree(struct vmspace *vm)
324 {
325 
326 	CTR1(KTR_VM, "vmspace_free: %p", vm);
327 
328 	/*
329 	 * Make sure any SysV shm is freed; it might not have been in
330 	 * exit1().
331 	 */
332 	shmexit(vm);
333 
334 	/*
335 	 * Lock the map, to wait out all other references to it.
336 	 * Delete all of the mappings and pages they hold, then call
337 	 * the pmap module to reclaim anything left.
338 	 */
339 	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
340 	    vm->vm_map.max_offset);
341 
342 	pmap_release(vmspace_pmap(vm));
343 	vm->vm_map.pmap = NULL;
344 	uma_zfree(vmspace_zone, vm);
345 }
346 
347 void
348 vmspace_free(struct vmspace *vm)
349 {
350 
351 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
352 	    "vmspace_free() called");
353 
354 	if (vm->vm_refcnt == 0)
355 		panic("vmspace_free: attempt to free already freed vmspace");
356 
357 	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
358 		vmspace_dofree(vm);
359 }
360 
361 void
362 vmspace_exitfree(struct proc *p)
363 {
364 	struct vmspace *vm;
365 
366 	PROC_VMSPACE_LOCK(p);
367 	vm = p->p_vmspace;
368 	p->p_vmspace = NULL;
369 	PROC_VMSPACE_UNLOCK(p);
370 	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
371 	vmspace_free(vm);
372 }
373 
374 void
375 vmspace_exit(struct thread *td)
376 {
377 	int refcnt;
378 	struct vmspace *vm;
379 	struct proc *p;
380 
381 	/*
382 	 * Release user portion of address space.
383 	 * This releases references to vnodes,
384 	 * which could cause I/O if the file has been unlinked.
385 	 * Need to do this early enough that we can still sleep.
386 	 *
387 	 * The last exiting process to reach this point releases as
388 	 * much of the environment as it can. vmspace_dofree() is the
389 	 * slower fallback in case another process had a temporary
390 	 * reference to the vmspace.
391 	 */
392 
393 	p = td->td_proc;
394 	vm = p->p_vmspace;
395 	atomic_add_int(&vmspace0.vm_refcnt, 1);
396 	do {
397 		refcnt = vm->vm_refcnt;
398 		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
399 			/* Switch now since other proc might free vmspace */
400 			PROC_VMSPACE_LOCK(p);
401 			p->p_vmspace = &vmspace0;
402 			PROC_VMSPACE_UNLOCK(p);
403 			pmap_activate(td);
404 		}
405 	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
406 	if (refcnt == 1) {
407 		if (p->p_vmspace != vm) {
408 			/* vmspace not yet freed, switch back */
409 			PROC_VMSPACE_LOCK(p);
410 			p->p_vmspace = vm;
411 			PROC_VMSPACE_UNLOCK(p);
412 			pmap_activate(td);
413 		}
414 		pmap_remove_pages(vmspace_pmap(vm));
415 		/* Switch now since this proc will free vmspace */
416 		PROC_VMSPACE_LOCK(p);
417 		p->p_vmspace = &vmspace0;
418 		PROC_VMSPACE_UNLOCK(p);
419 		pmap_activate(td);
420 		vmspace_dofree(vm);
421 	}
422 #ifdef RACCT
423 	if (racct_enable)
424 		vmspace_container_reset(p);
425 #endif
426 }
427 
428 /* Acquire reference to vmspace owned by another process. */
429 
430 struct vmspace *
431 vmspace_acquire_ref(struct proc *p)
432 {
433 	struct vmspace *vm;
434 	int refcnt;
435 
436 	PROC_VMSPACE_LOCK(p);
437 	vm = p->p_vmspace;
438 	if (vm == NULL) {
439 		PROC_VMSPACE_UNLOCK(p);
440 		return (NULL);
441 	}
442 	do {
443 		refcnt = vm->vm_refcnt;
444 		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
445 			PROC_VMSPACE_UNLOCK(p);
446 			return (NULL);
447 		}
448 	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
449 	if (vm != p->p_vmspace) {
450 		PROC_VMSPACE_UNLOCK(p);
451 		vmspace_free(vm);
452 		return (NULL);
453 	}
454 	PROC_VMSPACE_UNLOCK(p);
455 	return (vm);
456 }
457 
458 /*
459  * Switch between vmspaces in an AIO kernel process.
460  *
461  * The AIO kernel processes switch to and from a user process's
462  * vmspace while performing an I/O operation on behalf of a user
463  * process.  The new vmspace is either the vmspace of a user process
464  * obtained from an active AIO request or the initial vmspace of the
465  * AIO kernel process (when it is idling).  Because user processes
466  * will block to drain any active AIO requests before proceeding in
467  * exit() or execve(), the vmspace reference count for these vmspaces
468  * can never be 0.  This allows for a much simpler implementation than
469  * the loop in vmspace_acquire_ref() above.  Similarly, AIO kernel
470  * processes hold an extra reference on their initial vmspace for the
471  * life of the process so that this guarantee is true for any vmspace
472  * passed as 'newvm'.
473  */
474 void
475 vmspace_switch_aio(struct vmspace *newvm)
476 {
477 	struct vmspace *oldvm;
478 
479 	/* XXX: Need some way to assert that this is an aio daemon. */
480 
481 	KASSERT(newvm->vm_refcnt > 0,
482 	    ("vmspace_switch_aio: newvm unreferenced"));
483 
484 	oldvm = curproc->p_vmspace;
485 	if (oldvm == newvm)
486 		return;
487 
488 	/*
489 	 * Point to the new address space and refer to it.
490 	 */
491 	curproc->p_vmspace = newvm;
492 	atomic_add_int(&newvm->vm_refcnt, 1);
493 
494 	/* Activate the new mapping. */
495 	pmap_activate(curthread);
496 
497 	/* Remove the daemon's reference to the old address space. */
498 	KASSERT(oldvm->vm_refcnt > 1,
499 	    ("vmspace_switch_aio: oldvm dropping last reference"));
500 	vmspace_free(oldvm);
501 }
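
/*
 * Hedged sketch of the call pattern described above (identifiers are
 * illustrative, never compiled): an AIO daemon adopts the user's vmspace
 * for one request and then returns to its own.
 */
#if 0
static void
example_aio_switch(struct vmspace *uservm)
{
	struct vmspace *myvm;

	myvm = curproc->p_vmspace;	/* daemon's vmspace, extra ref held */
	vmspace_switch_aio(uservm);	/* run the request in the user vmspace */
	/* ... perform the I/O ... */
	vmspace_switch_aio(myvm);	/* switch back when done */
}
#endif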
502 
503 void
504 _vm_map_lock(vm_map_t map, const char *file, int line)
505 {
506 
507 	if (map->system_map)
508 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
509 	else
510 		sx_xlock_(&map->lock, file, line);
511 	map->timestamp++;
512 }
513 
514 static void
515 vm_map_process_deferred(void)
516 {
517 	struct thread *td;
518 	vm_map_entry_t entry, next;
519 	vm_object_t object;
520 
521 	td = curthread;
522 	entry = td->td_map_def_user;
523 	td->td_map_def_user = NULL;
524 	while (entry != NULL) {
525 		next = entry->next;
526 		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
527 			/*
528 			 * Decrement the object's writemappings and
529 			 * possibly the vnode's v_writecount.
530 			 */
531 			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
532 			    ("Submap with writecount"));
533 			object = entry->object.vm_object;
534 			KASSERT(object != NULL, ("No object for writecount"));
535 			vnode_pager_release_writecount(object, entry->start,
536 			    entry->end);
537 		}
538 		vm_map_entry_deallocate(entry, FALSE);
539 		entry = next;
540 	}
541 }
542 
543 void
544 _vm_map_unlock(vm_map_t map, const char *file, int line)
545 {
546 
547 	if (map->system_map)
548 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
549 	else {
550 		sx_xunlock_(&map->lock, file, line);
551 		vm_map_process_deferred();
552 	}
553 }
554 
555 void
556 _vm_map_lock_read(vm_map_t map, const char *file, int line)
557 {
558 
559 	if (map->system_map)
560 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
561 	else
562 		sx_slock_(&map->lock, file, line);
563 }
564 
565 void
566 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
567 {
568 
569 	if (map->system_map)
570 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
571 	else {
572 		sx_sunlock_(&map->lock, file, line);
573 		vm_map_process_deferred();
574 	}
575 }
576 
577 int
578 _vm_map_trylock(vm_map_t map, const char *file, int line)
579 {
580 	int error;
581 
582 	error = map->system_map ?
583 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
584 	    !sx_try_xlock_(&map->lock, file, line);
585 	if (error == 0)
586 		map->timestamp++;
587 	return (error == 0);
588 }
589 
590 int
591 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
592 {
593 	int error;
594 
595 	error = map->system_map ?
596 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
597 	    !sx_try_slock_(&map->lock, file, line);
598 	return (error == 0);
599 }
600 
601 /*
602  *	_vm_map_lock_upgrade:	[ internal use only ]
603  *
604  *	Tries to upgrade a read (shared) lock on the specified map to a write
605  *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
606  *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
607  *	returned without a read or write lock held.
608  *
609  *	Requires that the map be read locked.
610  */
611 int
612 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
613 {
614 	unsigned int last_timestamp;
615 
616 	if (map->system_map) {
617 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
618 	} else {
619 		if (!sx_try_upgrade_(&map->lock, file, line)) {
620 			last_timestamp = map->timestamp;
621 			sx_sunlock_(&map->lock, file, line);
622 			vm_map_process_deferred();
623 			/*
624 			 * If the map's timestamp does not change while the
625 			 * map is unlocked, then the upgrade succeeds.
626 			 */
627 			sx_xlock_(&map->lock, file, line);
628 			if (last_timestamp != map->timestamp) {
629 				sx_xunlock_(&map->lock, file, line);
630 				return (1);
631 			}
632 		}
633 	}
634 	map->timestamp++;
635 	return (0);
636 }
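
/*
 * Illustrative caller pattern (never compiled): when the upgrade fails no
 * lock is held and the map may have changed, so the lookup must be redone
 * from scratch.
 */
#if 0
static void
example_upgrade_retry(vm_map_t map)
{
retry:
	vm_map_lock_read(map);
	/* ... locate the entry of interest under the read lock ... */
	if (vm_map_lock_upgrade(map))
		goto retry;		/* lost the lock; start over */
	/* ... modify the map under the exclusive lock ... */
	vm_map_unlock(map);
}
#endif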
637 
638 void
639 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
640 {
641 
642 	if (map->system_map) {
643 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
644 	} else
645 		sx_downgrade_(&map->lock, file, line);
646 }
647 
648 /*
649  *	vm_map_locked:
650  *
651  *	Returns a non-zero value if the caller holds a write (exclusive) lock
652  *	on the specified map and the value "0" otherwise.
653  */
654 int
655 vm_map_locked(vm_map_t map)
656 {
657 
658 	if (map->system_map)
659 		return (mtx_owned(&map->system_mtx));
660 	else
661 		return (sx_xlocked(&map->lock));
662 }
663 
664 #ifdef INVARIANTS
665 static void
666 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
667 {
668 
669 	if (map->system_map)
670 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
671 	else
672 		sx_assert_(&map->lock, SA_XLOCKED, file, line);
673 }
674 
675 #define	VM_MAP_ASSERT_LOCKED(map) \
676     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
677 #else
678 #define	VM_MAP_ASSERT_LOCKED(map)
679 #endif
680 
681 /*
682  *	_vm_map_unlock_and_wait:
683  *
684  *	Atomically releases the lock on the specified map and puts the calling
685  *	thread to sleep.  The calling thread will remain asleep until either
686  *	vm_map_wakeup() is performed on the map or the specified timeout is
687  *	exceeded.
688  *
689  *	WARNING!  This function does not perform deferred deallocations of
690  *	objects and map	entries.  Therefore, the calling thread is expected to
691  *	reacquire the map lock after reawakening and later perform an ordinary
692  *	unlock operation, such as vm_map_unlock(), before completing its
693  *	operation on the map.
694  */
695 int
696 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
697 {
698 
699 	mtx_lock(&map_sleep_mtx);
700 	if (map->system_map)
701 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
702 	else
703 		sx_xunlock_(&map->lock, file, line);
704 	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
705 	    timo));
706 }
707 
708 /*
709  *	vm_map_wakeup:
710  *
711  *	Awaken any threads that have slept on the map using
712  *	vm_map_unlock_and_wait().
713  */
714 void
715 vm_map_wakeup(vm_map_t map)
716 {
717 
718 	/*
719 	 * Acquire and release map_sleep_mtx to prevent a wakeup()
720 	 * from being performed (and lost) between the map unlock
721 	 * and the msleep() in _vm_map_unlock_and_wait().
722 	 */
723 	mtx_lock(&map_sleep_mtx);
724 	mtx_unlock(&map_sleep_mtx);
725 	wakeup(&map->root);
726 }
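
/*
 * Sketch of the sleep/wakeup protocol (illustrative, never compiled): a
 * waiter flags the map, sleeps, and reacquires the lock; the thread that
 * clears the awaited condition issues the wakeup while holding the lock.
 */
#if 0
/* Waiter, called with the map locked. */
static void
example_wait_for_change(vm_map_t map)
{
	map->needs_wakeup = TRUE;
	(void)vm_map_unlock_and_wait(map, 0);
	vm_map_lock(map);		/* reacquire before continuing */
}

/* Waker, called with the map locked after clearing the condition. */
static void
example_notify_waiters(vm_map_t map)
{
	if (map->needs_wakeup) {
		map->needs_wakeup = FALSE;
		vm_map_wakeup(map);
	}
}
#endif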
727 
728 void
729 vm_map_busy(vm_map_t map)
730 {
731 
732 	VM_MAP_ASSERT_LOCKED(map);
733 	map->busy++;
734 }
735 
736 void
737 vm_map_unbusy(vm_map_t map)
738 {
739 
740 	VM_MAP_ASSERT_LOCKED(map);
741 	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
742 	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
743 		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
744 		wakeup(&map->busy);
745 	}
746 }
747 
748 void
749 vm_map_wait_busy(vm_map_t map)
750 {
751 
752 	VM_MAP_ASSERT_LOCKED(map);
753 	while (map->busy) {
754 		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
755 		if (map->system_map)
756 			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
757 		else
758 			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
759 	}
760 	map->timestamp++;
761 }
762 
763 long
764 vmspace_resident_count(struct vmspace *vmspace)
765 {
766 	return pmap_resident_count(vmspace_pmap(vmspace));
767 }
768 
769 /*
770  *	vm_map_create:
771  *
772  *	Creates and returns a new empty VM map with
773  *	the given physical map structure, and having
774  *	the given lower and upper address bounds.
775  */
776 vm_map_t
777 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
778 {
779 	vm_map_t result;
780 
781 	result = uma_zalloc(mapzone, M_WAITOK);
782 	CTR1(KTR_VM, "vm_map_create: %p", result);
783 	_vm_map_init(result, pmap, min, max);
784 	return (result);
785 }
786 
787 /*
788  * Initialize an existing vm_map structure
789  * such as that in the vmspace structure.
790  */
791 static void
792 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
793 {
794 
795 	map->header.next = map->header.prev = &map->header;
796 	map->needs_wakeup = FALSE;
797 	map->system_map = 0;
798 	map->pmap = pmap;
799 	map->min_offset = min;
800 	map->max_offset = max;
801 	map->flags = 0;
802 	map->root = NULL;
803 	map->timestamp = 0;
804 	map->busy = 0;
805 }
806 
807 void
808 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
809 {
810 
811 	_vm_map_init(map, pmap, min, max);
812 	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
813 	sx_init(&map->lock, "user map");
814 }
815 
816 /*
817  *	vm_map_entry_dispose:	[ internal use only ]
818  *
819  *	Inverse of vm_map_entry_create.
820  */
821 static void
822 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
823 {
824 	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
825 }
826 
827 /*
828  *	vm_map_entry_create:	[ internal use only ]
829  *
830  *	Allocates a VM map entry for insertion.
831  *	No entry fields are filled in.
832  */
833 static vm_map_entry_t
834 vm_map_entry_create(vm_map_t map)
835 {
836 	vm_map_entry_t new_entry;
837 
838 	if (map->system_map)
839 		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
840 	else
841 		new_entry = uma_zalloc(mapentzone, M_WAITOK);
842 	if (new_entry == NULL)
843 		panic("vm_map_entry_create: kernel resources exhausted");
844 	return (new_entry);
845 }
846 
847 /*
848  *	vm_map_entry_set_behavior:
849  *
850  *	Set the expected access behavior, either normal, random, or
851  *	sequential.
852  */
853 static inline void
854 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
855 {
856 	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
857 	    (behavior & MAP_ENTRY_BEHAV_MASK);
858 }
859 
860 /*
861  *	vm_map_entry_set_max_free:
862  *
863  *	Set the max_free field in a vm_map_entry.
864  */
865 static inline void
866 vm_map_entry_set_max_free(vm_map_entry_t entry)
867 {
868 
869 	entry->max_free = entry->adj_free;
870 	if (entry->left != NULL && entry->left->max_free > entry->max_free)
871 		entry->max_free = entry->left->max_free;
872 	if (entry->right != NULL && entry->right->max_free > entry->max_free)
873 		entry->max_free = entry->right->max_free;
874 }
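
/*
 * Concrete illustration (hypothetical numbers): an entry with 8KB of free
 * space after it (adj_free == 8192), a left subtree whose largest gap is
 * 4KB and a right subtree whose largest gap is 16KB ends up with
 * max_free == 16384, the largest gap reachable anywhere in its subtree.
 */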
875 
876 /*
877  *	vm_map_entry_splay:
878  *
879  *	The Sleator and Tarjan top-down splay algorithm with the
880  *	following variation.  Max_free must be computed bottom-up, so
881  *	on the downward pass, maintain the left and right spines in
882  *	reverse order.  Then, make a second pass up each side to fix
883  *	the pointers and compute max_free.  The time bound is O(log n)
884  *	amortized.
885  *
886  *	The new root is the vm_map_entry containing "addr", or else an
887  *	adjacent entry (lower or higher) if addr is not in the tree.
888  *
889  *	The map must be locked, and leaves it so.
890  *
891  *	Returns: the new root.
892  */
893 static vm_map_entry_t
894 vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
895 {
896 	vm_map_entry_t llist, rlist;
897 	vm_map_entry_t ltree, rtree;
898 	vm_map_entry_t y;
899 
900 	/* Special case of empty tree. */
901 	if (root == NULL)
902 		return (root);
903 
904 	/*
905 	 * Pass One: Splay down the tree until we find addr or a NULL
906 	 * pointer where addr would go.  llist and rlist are the two
907 	 * sides in reverse order (bottom-up), with llist linked by
908 	 * the right pointer and rlist linked by the left pointer in
909 	 * the vm_map_entry.  Wait until Pass Two to set max_free on
910 	 * the two spines.
911 	 */
912 	llist = NULL;
913 	rlist = NULL;
914 	for (;;) {
915 		/* root is never NULL in here. */
916 		if (addr < root->start) {
917 			y = root->left;
918 			if (y == NULL)
919 				break;
920 			if (addr < y->start && y->left != NULL) {
921 				/* Rotate right and put y on rlist. */
922 				root->left = y->right;
923 				y->right = root;
924 				vm_map_entry_set_max_free(root);
925 				root = y->left;
926 				y->left = rlist;
927 				rlist = y;
928 			} else {
929 				/* Put root on rlist. */
930 				root->left = rlist;
931 				rlist = root;
932 				root = y;
933 			}
934 		} else if (addr >= root->end) {
935 			y = root->right;
936 			if (y == NULL)
937 				break;
938 			if (addr >= y->end && y->right != NULL) {
939 				/* Rotate left and put y on llist. */
940 				root->right = y->left;
941 				y->left = root;
942 				vm_map_entry_set_max_free(root);
943 				root = y->right;
944 				y->right = llist;
945 				llist = y;
946 			} else {
947 				/* Put root on llist. */
948 				root->right = llist;
949 				llist = root;
950 				root = y;
951 			}
952 		} else
953 			break;
954 	}
955 
956 	/*
957 	 * Pass Two: Walk back up the two spines, flip the pointers
958 	 * and set max_free.  The subtrees of the root go at the
959 	 * bottom of llist and rlist.
960 	 */
961 	ltree = root->left;
962 	while (llist != NULL) {
963 		y = llist->right;
964 		llist->right = ltree;
965 		vm_map_entry_set_max_free(llist);
966 		ltree = llist;
967 		llist = y;
968 	}
969 	rtree = root->right;
970 	while (rlist != NULL) {
971 		y = rlist->left;
972 		rlist->left = rtree;
973 		vm_map_entry_set_max_free(rlist);
974 		rtree = rlist;
975 		rlist = y;
976 	}
977 
978 	/*
979 	 * Final assembly: add ltree and rtree as subtrees of root.
980 	 */
981 	root->left = ltree;
982 	root->right = rtree;
983 	vm_map_entry_set_max_free(root);
984 
985 	return (root);
986 }
987 
988 /*
989  *	vm_map_entry_{un,}link:
990  *
991  *	Insert/remove entries from maps.
992  */
993 static void
994 vm_map_entry_link(vm_map_t map,
995 		  vm_map_entry_t after_where,
996 		  vm_map_entry_t entry)
997 {
998 
999 	CTR4(KTR_VM,
1000 	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
1001 	    map->nentries, entry, after_where);
1002 	VM_MAP_ASSERT_LOCKED(map);
1003 	KASSERT(after_where == &map->header ||
1004 	    after_where->end <= entry->start,
1005 	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
1006 	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
1007 	KASSERT(after_where->next == &map->header ||
1008 	    entry->end <= after_where->next->start,
1009 	    ("vm_map_entry_link: new end %jx next start %jx overlap",
1010 	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
1011 
1012 	map->nentries++;
1013 	entry->prev = after_where;
1014 	entry->next = after_where->next;
1015 	entry->next->prev = entry;
1016 	after_where->next = entry;
1017 
1018 	if (after_where != &map->header) {
1019 		if (after_where != map->root)
1020 			vm_map_entry_splay(after_where->start, map->root);
1021 		entry->right = after_where->right;
1022 		entry->left = after_where;
1023 		after_where->right = NULL;
1024 		after_where->adj_free = entry->start - after_where->end;
1025 		vm_map_entry_set_max_free(after_where);
1026 	} else {
1027 		entry->right = map->root;
1028 		entry->left = NULL;
1029 	}
1030 	entry->adj_free = (entry->next == &map->header ? map->max_offset :
1031 	    entry->next->start) - entry->end;
1032 	vm_map_entry_set_max_free(entry);
1033 	map->root = entry;
1034 }
1035 
1036 static void
1037 vm_map_entry_unlink(vm_map_t map,
1038 		    vm_map_entry_t entry)
1039 {
1040 	vm_map_entry_t next, prev, root;
1041 
1042 	VM_MAP_ASSERT_LOCKED(map);
1043 	if (entry != map->root)
1044 		vm_map_entry_splay(entry->start, map->root);
1045 	if (entry->left == NULL)
1046 		root = entry->right;
1047 	else {
1048 		root = vm_map_entry_splay(entry->start, entry->left);
1049 		root->right = entry->right;
1050 		root->adj_free = (entry->next == &map->header ? map->max_offset :
1051 		    entry->next->start) - root->end;
1052 		vm_map_entry_set_max_free(root);
1053 	}
1054 	map->root = root;
1055 
1056 	prev = entry->prev;
1057 	next = entry->next;
1058 	next->prev = prev;
1059 	prev->next = next;
1060 	map->nentries--;
1061 	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1062 	    map->nentries, entry);
1063 }
1064 
1065 /*
1066  *	vm_map_entry_resize_free:
1067  *
1068  *	Recompute the amount of free space following a vm_map_entry
1069  *	and propagate that value up the tree.  Call this function after
1070  *	resizing a map entry in-place, that is, without a call to
1071  *	vm_map_entry_link() or _unlink().
1072  *
1073  *	The map must be locked, and leaves it so.
1074  */
1075 static void
1076 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
1077 {
1078 
1079 	/*
1080 	 * Using splay trees without parent pointers, propagating
1081 	 * max_free up the tree is done by moving the entry to the
1082 	 * root and making the change there.
1083 	 */
1084 	if (entry != map->root)
1085 		map->root = vm_map_entry_splay(entry->start, map->root);
1086 
1087 	entry->adj_free = (entry->next == &map->header ? map->max_offset :
1088 	    entry->next->start) - entry->end;
1089 	vm_map_entry_set_max_free(entry);
1090 }
1091 
1092 /*
1093  *	vm_map_lookup_entry:	[ internal use only ]
1094  *
1095  *	Finds the map entry containing (or
1096  *	immediately preceding) the specified address
1097  *	in the given map; the entry is returned
1098  *	in the "entry" parameter.  The boolean
1099  *	result indicates whether the address is
1100  *	actually contained in the map.
1101  */
1102 boolean_t
1103 vm_map_lookup_entry(
1104 	vm_map_t map,
1105 	vm_offset_t address,
1106 	vm_map_entry_t *entry)	/* OUT */
1107 {
1108 	vm_map_entry_t cur;
1109 	boolean_t locked;
1110 
1111 	/*
1112 	 * If the map is empty, then the map entry immediately preceding
1113 	 * "address" is the map's header.
1114 	 */
1115 	cur = map->root;
1116 	if (cur == NULL)
1117 		*entry = &map->header;
1118 	else if (address >= cur->start && cur->end > address) {
1119 		*entry = cur;
1120 		return (TRUE);
1121 	} else if ((locked = vm_map_locked(map)) ||
1122 	    sx_try_upgrade(&map->lock)) {
1123 		/*
1124 		 * Splay requires a write lock on the map.  However, it only
1125 		 * restructures the binary search tree; it does not otherwise
1126 		 * change the map.  Thus, the map's timestamp need not change
1127 		 * on a temporary upgrade.
1128 		 */
1129 		map->root = cur = vm_map_entry_splay(address, cur);
1130 		if (!locked)
1131 			sx_downgrade(&map->lock);
1132 
1133 		/*
1134 		 * If "address" is contained within a map entry, the new root
1135 		 * is that map entry.  Otherwise, the new root is a map entry
1136 		 * immediately before or after "address".
1137 		 */
1138 		if (address >= cur->start) {
1139 			*entry = cur;
1140 			if (cur->end > address)
1141 				return (TRUE);
1142 		} else
1143 			*entry = cur->prev;
1144 	} else
1145 		/*
1146 		 * Since the map is only locked for read access, perform a
1147 		 * standard binary search tree lookup for "address".
1148 		 */
1149 		for (;;) {
1150 			if (address < cur->start) {
1151 				if (cur->left == NULL) {
1152 					*entry = cur->prev;
1153 					break;
1154 				}
1155 				cur = cur->left;
1156 			} else if (cur->end > address) {
1157 				*entry = cur;
1158 				return (TRUE);
1159 			} else {
1160 				if (cur->right == NULL) {
1161 					*entry = cur;
1162 					break;
1163 				}
1164 				cur = cur->right;
1165 			}
1166 		}
1167 	return (FALSE);
1168 }
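
/*
 * Typical lookup pattern (illustrative, never compiled): either "addr"
 * falls inside the returned entry, or the returned entry immediately
 * precedes the unmapped gap containing "addr".
 */
#if 0
static boolean_t
example_is_mapped(vm_map_t map, vm_offset_t addr)
{
	vm_map_entry_t entry;

	if (vm_map_lookup_entry(map, addr, &entry)) {
		/* addr lies within [entry->start, entry->end). */
		return (TRUE);
	}
	/* addr is unmapped; "entry" precedes the gap (or is the header). */
	return (FALSE);
}
#endif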
1169 
1170 /*
1171  *	vm_map_insert:
1172  *
1173  *	Inserts the given whole VM object into the target
1174  *	map at the specified address range.  The object's
1175  *	size should match that of the address range.
1176  *
1177  *	Requires that the map be locked, and leaves it so.
1178  *
1179  *	If object is non-NULL, ref count must be bumped by caller
1180  *	prior to making call to account for the new entry.
1181  */
1182 int
1183 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1184     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1185 {
1186 	vm_map_entry_t new_entry, prev_entry, temp_entry;
1187 	struct ucred *cred;
1188 	vm_eflags_t protoeflags;
1189 	vm_inherit_t inheritance;
1190 
1191 	VM_MAP_ASSERT_LOCKED(map);
1192 	KASSERT(object != kernel_object ||
1193 	    (cow & MAP_COPY_ON_WRITE) == 0,
1194 	    ("vm_map_insert: kernel object and COW"));
1195 	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
1196 	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1197 	KASSERT((prot & ~max) == 0,
1198 	    ("prot %#x is not subset of max_prot %#x", prot, max));
1199 
1200 	/*
1201 	 * Check that the start and end points are not bogus.
1202 	 */
1203 	if (start < map->min_offset || end > map->max_offset || start >= end)
1204 		return (KERN_INVALID_ADDRESS);
1205 
1206 	/*
1207 	 * Find the entry prior to the proposed starting address; if it's part
1208 	 * of an existing entry, this range is bogus.
1209 	 */
1210 	if (vm_map_lookup_entry(map, start, &temp_entry))
1211 		return (KERN_NO_SPACE);
1212 
1213 	prev_entry = temp_entry;
1214 
1215 	/*
1216 	 * Assert that the next entry doesn't overlap the end point.
1217 	 */
1218 	if (prev_entry->next != &map->header && prev_entry->next->start < end)
1219 		return (KERN_NO_SPACE);
1220 
1221 	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1222 	    max != VM_PROT_NONE))
1223 		return (KERN_INVALID_ARGUMENT);
1224 
1225 	protoeflags = 0;
1226 	if (cow & MAP_COPY_ON_WRITE)
1227 		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1228 	if (cow & MAP_NOFAULT)
1229 		protoeflags |= MAP_ENTRY_NOFAULT;
1230 	if (cow & MAP_DISABLE_SYNCER)
1231 		protoeflags |= MAP_ENTRY_NOSYNC;
1232 	if (cow & MAP_DISABLE_COREDUMP)
1233 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
1234 	if (cow & MAP_STACK_GROWS_DOWN)
1235 		protoeflags |= MAP_ENTRY_GROWS_DOWN;
1236 	if (cow & MAP_STACK_GROWS_UP)
1237 		protoeflags |= MAP_ENTRY_GROWS_UP;
1238 	if (cow & MAP_VN_WRITECOUNT)
1239 		protoeflags |= MAP_ENTRY_VN_WRITECNT;
1240 	if ((cow & MAP_CREATE_GUARD) != 0)
1241 		protoeflags |= MAP_ENTRY_GUARD;
1242 	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1243 		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1244 	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1245 		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1246 	if (cow & MAP_INHERIT_SHARE)
1247 		inheritance = VM_INHERIT_SHARE;
1248 	else
1249 		inheritance = VM_INHERIT_DEFAULT;
1250 
1251 	cred = NULL;
1252 	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1253 		goto charged;
1254 	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1255 	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1256 		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1257 			return (KERN_RESOURCE_SHORTAGE);
1258 		KASSERT(object == NULL ||
1259 		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1260 		    object->cred == NULL,
1261 		    ("overcommit: vm_map_insert o %p", object));
1262 		cred = curthread->td_ucred;
1263 	}
1264 
1265 charged:
1266 	/* Expand the kernel pmap, if necessary. */
1267 	if (map == kernel_map && end > kernel_vm_end)
1268 		pmap_growkernel(end);
1269 	if (object != NULL) {
1270 		/*
1271 		 * OBJ_ONEMAPPING must be cleared unless this mapping
1272 		 * is trivially proven to be the only mapping for any
1273 		 * of the object's pages.  (Object granularity
1274 		 * reference counting is insufficient to recognize
1275 		 * aliases with precision.)
1276 		 */
1277 		VM_OBJECT_WLOCK(object);
1278 		if (object->ref_count > 1 || object->shadow_count != 0)
1279 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
1280 		VM_OBJECT_WUNLOCK(object);
1281 	} else if (prev_entry != &map->header &&
1282 	    prev_entry->eflags == protoeflags &&
1283 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
1284 	    prev_entry->end == start && prev_entry->wired_count == 0 &&
1285 	    (prev_entry->cred == cred ||
1286 	    (prev_entry->object.vm_object != NULL &&
1287 	    prev_entry->object.vm_object->cred == cred)) &&
1288 	    vm_object_coalesce(prev_entry->object.vm_object,
1289 	    prev_entry->offset,
1290 	    (vm_size_t)(prev_entry->end - prev_entry->start),
1291 	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
1292 	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1293 		/*
1294 		 * We were able to extend the object.  Determine if we
1295 		 * can extend the previous map entry to include the
1296 		 * new range as well.
1297 		 */
1298 		if (prev_entry->inheritance == inheritance &&
1299 		    prev_entry->protection == prot &&
1300 		    prev_entry->max_protection == max) {
1301 			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1302 				map->size += end - prev_entry->end;
1303 			prev_entry->end = end;
1304 			vm_map_entry_resize_free(map, prev_entry);
1305 			vm_map_simplify_entry(map, prev_entry);
1306 			return (KERN_SUCCESS);
1307 		}
1308 
1309 		/*
1310 		 * If we can extend the object but cannot extend the
1311 		 * map entry, we have to create a new map entry.  We
1312 		 * must bump the ref count on the extended object to
1313 		 * account for it.  object may be NULL.
1314 		 */
1315 		object = prev_entry->object.vm_object;
1316 		offset = prev_entry->offset +
1317 		    (prev_entry->end - prev_entry->start);
1318 		vm_object_reference(object);
1319 		if (cred != NULL && object != NULL && object->cred != NULL &&
1320 		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1321 			/* Object already accounts for this uid. */
1322 			cred = NULL;
1323 		}
1324 	}
1325 	if (cred != NULL)
1326 		crhold(cred);
1327 
1328 	/*
1329 	 * Create a new entry
1330 	 */
1331 	new_entry = vm_map_entry_create(map);
1332 	new_entry->start = start;
1333 	new_entry->end = end;
1334 	new_entry->cred = NULL;
1335 
1336 	new_entry->eflags = protoeflags;
1337 	new_entry->object.vm_object = object;
1338 	new_entry->offset = offset;
1339 
1340 	new_entry->inheritance = inheritance;
1341 	new_entry->protection = prot;
1342 	new_entry->max_protection = max;
1343 	new_entry->wired_count = 0;
1344 	new_entry->wiring_thread = NULL;
1345 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1346 	new_entry->next_read = start;
1347 
1348 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1349 	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1350 	new_entry->cred = cred;
1351 
1352 	/*
1353 	 * Insert the new entry into the list
1354 	 */
1355 	vm_map_entry_link(map, prev_entry, new_entry);
1356 	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1357 		map->size += new_entry->end - new_entry->start;
1358 
1359 	/*
1360 	 * Try to coalesce the new entry with both the previous and next
1361 	 * entries in the list.  Previously, we only attempted to coalesce
1362 	 * with the previous entry when object is NULL.  Here, we handle the
1363 	 * other cases, which are less common.
1364 	 */
1365 	vm_map_simplify_entry(map, new_entry);
1366 
1367 	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1368 		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1369 		    end - start, cow & MAP_PREFAULT_PARTIAL);
1370 	}
1371 
1372 	return (KERN_SUCCESS);
1373 }
1374 
1375 /*
1376  *	vm_map_findspace:
1377  *
1378  *	Find the first fit (lowest VM address) for "length" free bytes
1379  *	beginning at address >= start in the given map.
1380  *
1381  *	In a vm_map_entry, "adj_free" is the amount of free space
1382  *	adjacent (higher address) to this entry, and "max_free" is the
1383  *	maximum amount of contiguous free space in its subtree.  This
1384  *	allows finding a free region in one path down the tree, so
1385  *	O(log n) amortized with splay trees.
1386  *
1387  *	The map must be locked, and leaves it so.
1388  *
1389  *	Returns: 0 on success, and starting address in *addr,
1390  *		 1 if insufficient space.
1391  */
1392 int
1393 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1394     vm_offset_t *addr)	/* OUT */
1395 {
1396 	vm_map_entry_t entry;
1397 	vm_offset_t st;
1398 
1399 	/*
1400 	 * Request must fit within min/max VM address and must avoid
1401 	 * address wrap.
1402 	 */
1403 	if (start < map->min_offset)
1404 		start = map->min_offset;
1405 	if (start + length > map->max_offset || start + length < start)
1406 		return (1);
1407 
1408 	/* Empty tree means wide open address space. */
1409 	if (map->root == NULL) {
1410 		*addr = start;
1411 		return (0);
1412 	}
1413 
1414 	/*
1415 	 * After splay, if start comes before root node, then there
1416 	 * must be a gap from start to the root.
1417 	 */
1418 	map->root = vm_map_entry_splay(start, map->root);
1419 	if (start + length <= map->root->start) {
1420 		*addr = start;
1421 		return (0);
1422 	}
1423 
1424 	/*
1425 	 * Root is the last node that might begin its gap before
1426 	 * start, and this is the last comparison where address
1427 	 * wrap might be a problem.
1428 	 */
1429 	st = (start > map->root->end) ? start : map->root->end;
1430 	if (length <= map->root->end + map->root->adj_free - st) {
1431 		*addr = st;
1432 		return (0);
1433 	}
1434 
1435 	/* With max_free, can immediately tell if no solution. */
1436 	entry = map->root->right;
1437 	if (entry == NULL || length > entry->max_free)
1438 		return (1);
1439 
1440 	/*
1441 	 * Search the right subtree in the order: left subtree, root,
1442 	 * right subtree (first fit).  The previous splay implies that
1443 	 * all regions in the right subtree have addresses > start.
1444 	 */
1445 	while (entry != NULL) {
1446 		if (entry->left != NULL && entry->left->max_free >= length)
1447 			entry = entry->left;
1448 		else if (entry->adj_free >= length) {
1449 			*addr = entry->end;
1450 			return (0);
1451 		} else
1452 			entry = entry->right;
1453 	}
1454 
1455 	/* Can't get here, so panic if we do. */
1456 	panic("vm_map_findspace: max_free corrupt");
1457 }
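
/*
 * Minimal sketch (never compiled) of a first-fit search followed by an
 * insertion, roughly what vm_map_find() does below for VMFS_ANY_SPACE.
 */
#if 0
static int
example_alloc_anon(vm_map_t map, vm_size_t length, vm_offset_t *addr)
{
	int rv;

	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), length, addr) != 0) {
		vm_map_unlock(map);
		return (KERN_NO_SPACE);
	}
	rv = vm_map_insert(map, NULL, 0, *addr, *addr + length,
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0);
	vm_map_unlock(map);
	return (rv);
}
#endif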
1458 
1459 int
1460 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1461     vm_offset_t start, vm_size_t length, vm_prot_t prot,
1462     vm_prot_t max, int cow)
1463 {
1464 	vm_offset_t end;
1465 	int result;
1466 
1467 	end = start + length;
1468 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1469 	    object == NULL,
1470 	    ("vm_map_fixed: non-NULL backing object for stack"));
1471 	vm_map_lock(map);
1472 	VM_MAP_RANGE_CHECK(map, start, end);
1473 	if ((cow & MAP_CHECK_EXCL) == 0)
1474 		vm_map_delete(map, start, end);
1475 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1476 		result = vm_map_stack_locked(map, start, length, sgrowsiz,
1477 		    prot, max, cow);
1478 	} else {
1479 		result = vm_map_insert(map, object, offset, start, end,
1480 		    prot, max, cow);
1481 	}
1482 	vm_map_unlock(map);
1483 	return (result);
1484 }
1485 
1486 /*
1487  *	vm_map_find finds an unallocated region in the target address
1488  *	map with the given length.  The search is defined to be
1489  *	first-fit from the specified address; the region found is
1490  *	returned in the same parameter.
1491  *
1492  *	If object is non-NULL, ref count must be bumped by caller
1493  *	prior to making call to account for the new entry.
1494  */
1495 int
1496 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1497 	    vm_offset_t *addr,	/* IN/OUT */
1498 	    vm_size_t length, vm_offset_t max_addr, int find_space,
1499 	    vm_prot_t prot, vm_prot_t max, int cow)
1500 {
1501 	vm_offset_t alignment, initial_addr, start;
1502 	int result;
1503 
1504 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1505 	    object == NULL,
1506 	    ("vm_map_find: non-NULL backing object for stack"));
1507 	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
1508 	    (object->flags & OBJ_COLORED) == 0))
1509 		find_space = VMFS_ANY_SPACE;
1510 	if (find_space >> 8 != 0) {
1511 		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
1512 		alignment = (vm_offset_t)1 << (find_space >> 8);
1513 	} else
1514 		alignment = 0;
1515 	initial_addr = *addr;
1516 	vm_map_lock(map);
1517 again:
1518 	start = initial_addr;
1519 	do {
1520 		if (find_space != VMFS_NO_SPACE) {
1521 			if (vm_map_findspace(map, start, length, addr) ||
1522 			    (max_addr != 0 && *addr + length > max_addr)) {
1523 				if (find_space == VMFS_OPTIMAL_SPACE) {
1524 					find_space = VMFS_ANY_SPACE;
1525 					goto again;
1526 				}
1527 				vm_map_unlock(map);
1528 				return (KERN_NO_SPACE);
1529 			}
1530 			switch (find_space) {
1531 			case VMFS_SUPER_SPACE:
1532 			case VMFS_OPTIMAL_SPACE:
1533 				pmap_align_superpage(object, offset, addr,
1534 				    length);
1535 				break;
1536 			case VMFS_ANY_SPACE:
1537 				break;
1538 			default:
1539 				if ((*addr & (alignment - 1)) != 0) {
1540 					*addr &= ~(alignment - 1);
1541 					*addr += alignment;
1542 				}
1543 				break;
1544 			}
1545 
1546 			start = *addr;
1547 		}
1548 		if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1549 			result = vm_map_stack_locked(map, start, length,
1550 			    sgrowsiz, prot, max, cow);
1551 		} else {
1552 			result = vm_map_insert(map, object, offset, start,
1553 			    start + length, prot, max, cow);
1554 		}
1555 	} while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE &&
1556 	    find_space != VMFS_ANY_SPACE);
1557 	vm_map_unlock(map);
1558 	return (result);
1559 }
1560 
1561 /*
1562  *	vm_map_find_min() is a variant of vm_map_find() that takes an
1563  *	additional parameter (min_addr) and treats the given address
1564  *	(*addr) differently.  Specifically, it treats *addr as a hint
1565  *	and not as the minimum address where the mapping is created.
1566  *
1567  *	This function works in two phases.  First, it tries to
1568  *	allocate above the hint.  If that fails and the hint is
1569  *	greater than min_addr, it performs a second pass, replacing
1570  *	the hint with min_addr as the minimum address for the
1571  *	allocation.
1572  */
1573 int
1574 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1575     vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
1576     vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
1577     int cow)
1578 {
1579 	vm_offset_t hint;
1580 	int rv;
1581 
1582 	hint = *addr;
1583 	for (;;) {
1584 		rv = vm_map_find(map, object, offset, addr, length, max_addr,
1585 		    find_space, prot, max, cow);
1586 		if (rv == KERN_SUCCESS || min_addr >= hint)
1587 			return (rv);
1588 		*addr = hint = min_addr;
1589 	}
1590 }
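
/*
 * Hedged usage sketch (never compiled; the wrapper name is illustrative):
 * *addr carries the hint on entry and the chosen address on success, with
 * min_addr as the fallback floor for the second pass.
 */
#if 0
static int
example_find_with_hint(vm_map_t map, vm_offset_t *addr, vm_size_t length,
    vm_offset_t min_addr)
{
	return (vm_map_find_min(map, NULL, 0, addr, length, min_addr,
	    0, VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0));
}
#endif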
1591 
1592 /*
1593  *	vm_map_simplify_entry:
1594  *
1595  *	Simplify the given map entry by merging with either neighbor.  This
1596  *	routine also has the ability to merge with both neighbors.
1597  *
1598  *	The map must be locked.
1599  *
1600  *	This routine guarantees that the passed entry remains valid (though
1601  *	possibly extended).  When merging, this routine may delete one or
1602  *	both neighbors.
1603  */
1604 void
1605 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1606 {
1607 	vm_map_entry_t next, prev;
1608 	vm_size_t prevsize, esize;
1609 
1610 	if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP |
1611 	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
1612 		return;
1613 
1614 	prev = entry->prev;
1615 	if (prev != &map->header) {
1616 		prevsize = prev->end - prev->start;
1617 		if ( (prev->end == entry->start) &&
1618 		     (prev->object.vm_object == entry->object.vm_object) &&
1619 		     (!prev->object.vm_object ||
1620 			(prev->offset + prevsize == entry->offset)) &&
1621 		     (prev->eflags == entry->eflags) &&
1622 		     (prev->protection == entry->protection) &&
1623 		     (prev->max_protection == entry->max_protection) &&
1624 		     (prev->inheritance == entry->inheritance) &&
1625 		     (prev->wired_count == entry->wired_count) &&
1626 		     (prev->cred == entry->cred)) {
1627 			vm_map_entry_unlink(map, prev);
1628 			entry->start = prev->start;
1629 			entry->offset = prev->offset;
1630 			if (entry->prev != &map->header)
1631 				vm_map_entry_resize_free(map, entry->prev);
1632 
1633 			/*
1634 			 * If the backing object is a vnode object,
1635 			 * vm_object_deallocate() calls vrele().
1636 			 * However, vrele() does not lock the vnode
1637 			 * because the vnode has additional
1638 			 * references.  Thus, the map lock can be kept
1639 			 * without causing a lock-order reversal with
1640 			 * the vnode lock.
1641 			 *
1642 			 * Since we count the number of virtual page
1643 			 * mappings in object->un_pager.vnp.writemappings,
1644 			 * the writemappings value should not be adjusted
1645 			 * when the entry is disposed of.
1646 			 */
1647 			if (prev->object.vm_object)
1648 				vm_object_deallocate(prev->object.vm_object);
1649 			if (prev->cred != NULL)
1650 				crfree(prev->cred);
1651 			vm_map_entry_dispose(map, prev);
1652 		}
1653 	}
1654 
1655 	next = entry->next;
1656 	if (next != &map->header) {
1657 		esize = entry->end - entry->start;
1658 		if ((entry->end == next->start) &&
1659 		    (next->object.vm_object == entry->object.vm_object) &&
1660 		     (!entry->object.vm_object ||
1661 			(entry->offset + esize == next->offset)) &&
1662 		    (next->eflags == entry->eflags) &&
1663 		    (next->protection == entry->protection) &&
1664 		    (next->max_protection == entry->max_protection) &&
1665 		    (next->inheritance == entry->inheritance) &&
1666 		    (next->wired_count == entry->wired_count) &&
1667 		    (next->cred == entry->cred)) {
1668 			vm_map_entry_unlink(map, next);
1669 			entry->end = next->end;
1670 			vm_map_entry_resize_free(map, entry);
1671 
1672 			/*
1673 			 * See comment above.
1674 			 */
1675 			if (next->object.vm_object)
1676 				vm_object_deallocate(next->object.vm_object);
1677 			if (next->cred != NULL)
1678 				crfree(next->cred);
1679 			vm_map_entry_dispose(map, next);
1680 		}
1681 	}
1682 }
1683 /*
1684  *	vm_map_clip_start:	[ internal use only ]
1685  *
1686  *	Asserts that the given entry begins at or after
1687  *	the specified address; if necessary,
1688  *	it splits the entry into two.
1689  */
1690 #define vm_map_clip_start(map, entry, startaddr) \
1691 { \
1692 	if (startaddr > entry->start) \
1693 		_vm_map_clip_start(map, entry, startaddr); \
1694 }
1695 
1696 /*
1697  *	This routine is called only when it is known that
1698  *	the entry must be split.
1699  */
1700 static void
1701 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
1702 {
1703 	vm_map_entry_t new_entry;
1704 
1705 	VM_MAP_ASSERT_LOCKED(map);
1706 	KASSERT(entry->end > start && entry->start < start,
1707 	    ("_vm_map_clip_start: invalid clip of entry %p", entry));
1708 
1709 	/*
1710 	 * Split off the front portion -- note that we must insert the new
1711 	 * entry BEFORE this one, so that this entry has the specified
1712 	 * starting address.
1713 	 */
1714 	vm_map_simplify_entry(map, entry);
1715 
1716 	/*
1717 	 * If there is no object backing this entry, we might as well create
1718 	 * one now.  If we defer it, an object can get created after the map
1719 	 * is clipped, and individual objects will be created for the split-up
1720 	 * map.  This is a bit of a hack, but is also about the best place to
1721 	 * put this improvement.
1722 	 */
1723 	if (entry->object.vm_object == NULL && !map->system_map &&
1724 	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
1725 		vm_object_t object;
1726 		object = vm_object_allocate(OBJT_DEFAULT,
1727 				atop(entry->end - entry->start));
1728 		entry->object.vm_object = object;
1729 		entry->offset = 0;
1730 		if (entry->cred != NULL) {
1731 			object->cred = entry->cred;
1732 			object->charge = entry->end - entry->start;
1733 			entry->cred = NULL;
1734 		}
1735 	} else if (entry->object.vm_object != NULL &&
1736 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1737 		   entry->cred != NULL) {
1738 		VM_OBJECT_WLOCK(entry->object.vm_object);
1739 		KASSERT(entry->object.vm_object->cred == NULL,
1740 		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
1741 		entry->object.vm_object->cred = entry->cred;
1742 		entry->object.vm_object->charge = entry->end - entry->start;
1743 		VM_OBJECT_WUNLOCK(entry->object.vm_object);
1744 		entry->cred = NULL;
1745 	}
1746 
1747 	new_entry = vm_map_entry_create(map);
1748 	*new_entry = *entry;
1749 
1750 	new_entry->end = start;
1751 	entry->offset += (start - entry->start);
1752 	entry->start = start;
1753 	if (new_entry->cred != NULL)
1754 		crhold(entry->cred);
1755 
1756 	vm_map_entry_link(map, entry->prev, new_entry);
1757 
1758 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1759 		vm_object_reference(new_entry->object.vm_object);
1760 		/*
1761 		 * The object->un_pager.vnp.writemappings for the
1762 		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
1763 		 * kept as is here.  The virtual pages are
1764 		 * re-distributed among the clipped entries, so the sum is
1765 		 * left the same.
1766 		 */
1767 	}
1768 }
1769 
1770 /*
1771  *	vm_map_clip_end:	[ internal use only ]
1772  *
1773  *	Asserts that the given entry ends at or before
1774  *	the specified address; if necessary,
1775  *	it splits the entry into two.
1776  */
1777 #define vm_map_clip_end(map, entry, endaddr) \
1778 { \
1779 	if ((endaddr) < (entry->end)) \
1780 		_vm_map_clip_end((map), (entry), (endaddr)); \
1781 }
1782 
1783 /*
1784  *	This routine is called only when it is known that
1785  *	the entry must be split.
1786  */
1787 static void
1788 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
1789 {
1790 	vm_map_entry_t new_entry;
1791 
1792 	VM_MAP_ASSERT_LOCKED(map);
1793 	KASSERT(entry->start < end && entry->end > end,
1794 	    ("_vm_map_clip_end: invalid clip of entry %p", entry));
1795 
1796 	/*
1797 	 * If there is no object backing this entry, we might as well create
1798 	 * one now.  If we defer it, an object can get created after the map
1799 	 * is clipped, and individual objects will be created for the split-up
1800 	 * map.  This is a bit of a hack, but is also about the best place to
1801 	 * put this improvement.
1802 	 */
1803 	if (entry->object.vm_object == NULL && !map->system_map &&
1804 	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
1805 		vm_object_t object;
1806 		object = vm_object_allocate(OBJT_DEFAULT,
1807 				atop(entry->end - entry->start));
1808 		entry->object.vm_object = object;
1809 		entry->offset = 0;
1810 		if (entry->cred != NULL) {
1811 			object->cred = entry->cred;
1812 			object->charge = entry->end - entry->start;
1813 			entry->cred = NULL;
1814 		}
1815 	} else if (entry->object.vm_object != NULL &&
1816 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1817 		   entry->cred != NULL) {
1818 		VM_OBJECT_WLOCK(entry->object.vm_object);
1819 		KASSERT(entry->object.vm_object->cred == NULL,
1820 		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
1821 		entry->object.vm_object->cred = entry->cred;
1822 		entry->object.vm_object->charge = entry->end - entry->start;
1823 		VM_OBJECT_WUNLOCK(entry->object.vm_object);
1824 		entry->cred = NULL;
1825 	}
1826 
1827 	/*
1828 	 * Create a new entry and insert it AFTER the specified entry
1829 	 */
1830 	new_entry = vm_map_entry_create(map);
1831 	*new_entry = *entry;
1832 
1833 	new_entry->start = entry->end = end;
1834 	new_entry->offset += (end - entry->start);
1835 	if (new_entry->cred != NULL)
1836 		crhold(entry->cred);
1837 
1838 	vm_map_entry_link(map, entry, new_entry);
1839 
1840 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1841 		vm_object_reference(new_entry->object.vm_object);
1842 	}
1843 }
1844 
1845 /*
1846  *	vm_map_submap:		[ kernel use only ]
1847  *
1848  *	Mark the given range as handled by a subordinate map.
1849  *
1850  *	This range must have been created with vm_map_find,
1851  *	and no other operations may have been performed on this
1852  *	range prior to calling vm_map_submap.
1853  *
1854  *	Only a limited number of operations can be performed
1855  *	within this range after calling vm_map_submap:
1856  *		vm_fault
1857  *	[Don't try vm_map_copy!]
1858  *
1859  *	To remove a submapping, one must first remove the
1860  *	range from the superior map, and then destroy the
1861  *	submap (if desired).  [Better yet, don't try it.]
1862  */
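/*
 * Illustrative sketch only (hypothetical names and range, not taken from
 * the original sources): a kernel subsystem that owns a range created
 * with vm_map_find() might install a subordinate map roughly as follows:
 *
 *	if (vm_map_submap(kernel_map, start, start + size, my_submap) !=
 *	    KERN_SUCCESS)
 *		panic("cannot install submap");
 *
 * where "my_submap", "start", and "size" are assumed to have been set up
 * by the caller beforehand.
 */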
1863 int
1864 vm_map_submap(
1865 	vm_map_t map,
1866 	vm_offset_t start,
1867 	vm_offset_t end,
1868 	vm_map_t submap)
1869 {
1870 	vm_map_entry_t entry;
1871 	int result = KERN_INVALID_ARGUMENT;
1872 
1873 	vm_map_lock(map);
1874 
1875 	VM_MAP_RANGE_CHECK(map, start, end);
1876 
1877 	if (vm_map_lookup_entry(map, start, &entry)) {
1878 		vm_map_clip_start(map, entry, start);
1879 	} else
1880 		entry = entry->next;
1881 
1882 	vm_map_clip_end(map, entry, end);
1883 
1884 	if ((entry->start == start) && (entry->end == end) &&
1885 	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1886 	    (entry->object.vm_object == NULL)) {
1887 		entry->object.sub_map = submap;
1888 		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
1889 		result = KERN_SUCCESS;
1890 	}
1891 	vm_map_unlock(map);
1892 
1893 	return (result);
1894 }
1895 
1896 /*
1897  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
1898  */
1899 #define	MAX_INIT_PT	96
1900 
1901 /*
1902  *	vm_map_pmap_enter:
1903  *
1904  *	Preload the specified map's pmap with mappings to the specified
1905  *	object's memory-resident pages.  No further physical pages are
1906  *	allocated, and no further virtual pages are retrieved from secondary
1907  *	storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
1908  *	limited number of page mappings are created at the low-end of the
1909  *	specified address range.  (For this purpose, a superpage mapping
1910  *	counts as one page mapping.)  Otherwise, all resident pages within
1911  *	the specified address range are mapped.
1912  */
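/*
 * Worked example (illustrative, assuming 4KB base pages): with
 * MAX_INIT_PT at 96, a MAP_PREFAULT_PARTIAL request covering a 1MB range
 * (256 pages) creates mappings for at most the first 96 resident small
 * pages, while a fully valid superpage at the start of the range counts
 * as only one mapping toward that limit.
 */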
1913 static void
1914 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
1915     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
1916 {
1917 	vm_offset_t start;
1918 	vm_page_t p, p_start;
1919 	vm_pindex_t mask, psize, threshold, tmpidx;
1920 
1921 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
1922 		return;
1923 	VM_OBJECT_RLOCK(object);
1924 	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1925 		VM_OBJECT_RUNLOCK(object);
1926 		VM_OBJECT_WLOCK(object);
1927 		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1928 			pmap_object_init_pt(map->pmap, addr, object, pindex,
1929 			    size);
1930 			VM_OBJECT_WUNLOCK(object);
1931 			return;
1932 		}
1933 		VM_OBJECT_LOCK_DOWNGRADE(object);
1934 	}
1935 
1936 	psize = atop(size);
1937 	if (psize + pindex > object->size) {
1938 		if (object->size < pindex) {
1939 			VM_OBJECT_RUNLOCK(object);
1940 			return;
1941 		}
1942 		psize = object->size - pindex;
1943 	}
1944 
1945 	start = 0;
1946 	p_start = NULL;
1947 	threshold = MAX_INIT_PT;
1948 
1949 	p = vm_page_find_least(object, pindex);
1950 	/*
1951 	 * Assert: the variable p is either (1) the page with the
1952 	 * least pindex greater than or equal to the parameter pindex
1953 	 * or (2) NULL.
1954 	 */
1955 	for (;
1956 	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
1957 	     p = TAILQ_NEXT(p, listq)) {
1958 		/*
1959 		 * Don't let madvise consume the last of our truly free
1960 		 * pages by allocating pv entries.
1961 		 */
1962 		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
1963 		    vm_cnt.v_free_count < vm_cnt.v_free_reserved) ||
1964 		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
1965 		    tmpidx >= threshold)) {
1966 			psize = tmpidx;
1967 			break;
1968 		}
1969 		if (p->valid == VM_PAGE_BITS_ALL) {
1970 			if (p_start == NULL) {
1971 				start = addr + ptoa(tmpidx);
1972 				p_start = p;
1973 			}
1974 			/* Jump ahead if a superpage mapping is possible. */
1975 			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
1976 			    (pagesizes[p->psind] - 1)) == 0) {
1977 				mask = atop(pagesizes[p->psind]) - 1;
1978 				if (tmpidx + mask < psize &&
1979 				    vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
1980 					p += mask;
1981 					threshold += mask;
1982 				}
1983 			}
1984 		} else if (p_start != NULL) {
1985 			pmap_enter_object(map->pmap, start, addr +
1986 			    ptoa(tmpidx), p_start, prot);
1987 			p_start = NULL;
1988 		}
1989 	}
1990 	if (p_start != NULL)
1991 		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
1992 		    p_start, prot);
1993 	VM_OBJECT_RUNLOCK(object);
1994 }
1995 
1996 /*
1997  *	vm_map_protect:
1998  *
1999  *	Sets the protection of the specified address
2000  *	region in the target map.  If "set_max" is
2001  *	specified, the maximum protection is to be set;
2002  *	otherwise, only the current protection is affected.
2003  */
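/*
 * Illustrative sketch (hypothetical range, not taken from the original
 * sources): an mprotect(2)-style caller that narrows the current
 * protection might do
 *
 *	rv = vm_map_protect(&curproc->p_vmspace->vm_map, trunc_page(addr),
 *	    round_page(addr + len), VM_PROT_READ, FALSE);
 *
 * Passing TRUE for set_max instead rewrites max_protection and clamps
 * the current protection to the new maximum.
 */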
2004 int
2005 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2006 	       vm_prot_t new_prot, boolean_t set_max)
2007 {
2008 	vm_map_entry_t current, entry;
2009 	vm_object_t obj;
2010 	struct ucred *cred;
2011 	vm_prot_t old_prot;
2012 
2013 	if (start == end)
2014 		return (KERN_SUCCESS);
2015 
2016 	vm_map_lock(map);
2017 
2018 	/*
2019 	 * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
2020 	 * need to fault pages into the map and will drop the map lock while
2021 	 * doing so, and the VM object may end up in an inconsistent state if we
2022 	 * update the protection on the map entry in between faults.
2023 	 */
2024 	vm_map_wait_busy(map);
2025 
2026 	VM_MAP_RANGE_CHECK(map, start, end);
2027 
2028 	if (vm_map_lookup_entry(map, start, &entry)) {
2029 		vm_map_clip_start(map, entry, start);
2030 	} else {
2031 		entry = entry->next;
2032 	}
2033 
2034 	/*
2035 	 * Make a first pass to check for protection violations.
2036 	 */
2037 	for (current = entry; current != &map->header && current->start < end;
2038 	    current = current->next) {
2039 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
2040 			continue;
2041 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2042 			vm_map_unlock(map);
2043 			return (KERN_INVALID_ARGUMENT);
2044 		}
2045 		if ((new_prot & current->max_protection) != new_prot) {
2046 			vm_map_unlock(map);
2047 			return (KERN_PROTECTION_FAILURE);
2048 		}
2049 	}
2050 
2051 	/*
2052 	 * Do an accounting pass for private read-only mappings that
2053 	 * will now do copy-on-write because a write is being allowed
2054 	 * (e.g., a debugger setting a breakpoint in a text segment).
2055 	 */
2056 	for (current = entry; current != &map->header && current->start < end;
2057 	    current = current->next) {
2058 
2059 		vm_map_clip_end(map, current, end);
2060 
2061 		if (set_max ||
2062 		    ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
2063 		    ENTRY_CHARGED(current) ||
2064 		    (current->eflags & MAP_ENTRY_GUARD) != 0) {
2065 			continue;
2066 		}
2067 
2068 		cred = curthread->td_ucred;
2069 		obj = current->object.vm_object;
2070 
2071 		if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
2072 			if (!swap_reserve(current->end - current->start)) {
2073 				vm_map_unlock(map);
2074 				return (KERN_RESOURCE_SHORTAGE);
2075 			}
2076 			crhold(cred);
2077 			current->cred = cred;
2078 			continue;
2079 		}
2080 
2081 		VM_OBJECT_WLOCK(obj);
2082 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2083 			VM_OBJECT_WUNLOCK(obj);
2084 			continue;
2085 		}
2086 
2087 		/*
2088 		 * Charge for the whole object allocation now, since
2089 		 * we cannot distinguish between non-charged and
2090 		 * charged clipped mapping of the same object later.
2091 		 */
2092 		KASSERT(obj->charge == 0,
2093 		    ("vm_map_protect: object %p overcharged (entry %p)",
2094 		    obj, current));
2095 		if (!swap_reserve(ptoa(obj->size))) {
2096 			VM_OBJECT_WUNLOCK(obj);
2097 			vm_map_unlock(map);
2098 			return (KERN_RESOURCE_SHORTAGE);
2099 		}
2100 
2101 		crhold(cred);
2102 		obj->cred = cred;
2103 		obj->charge = ptoa(obj->size);
2104 		VM_OBJECT_WUNLOCK(obj);
2105 	}
2106 
2107 	/*
2108 	 * Go back and fix up protections. [Note that clipping is not
2109 	 * necessary the second time.]
2110 	 */
2111 	for (current = entry; current != &map->header && current->start < end;
2112 	    current = current->next) {
2113 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
2114 			continue;
2115 
2116 		old_prot = current->protection;
2117 
2118 		if (set_max)
2119 			current->protection =
2120 			    (current->max_protection = new_prot) &
2121 			    old_prot;
2122 		else
2123 			current->protection = new_prot;
2124 
2125 		/*
2126 		 * For user wired map entries, the normal lazy evaluation of
2127 		 * write access upgrades through soft page faults is
2128 		 * undesirable.  Instead, immediately copy any pages that are
2129 		 * copy-on-write and enable write access in the physical map.
2130 		 */
2131 		if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2132 		    (current->protection & VM_PROT_WRITE) != 0 &&
2133 		    (old_prot & VM_PROT_WRITE) == 0)
2134 			vm_fault_copy_entry(map, map, current, current, NULL);
2135 
2136 		/*
2137 		 * When restricting access, update the physical map.  Worry
2138 		 * about copy-on-write here.
2139 		 */
2140 		if ((old_prot & ~current->protection) != 0) {
2141 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2142 							VM_PROT_ALL)
2143 			pmap_protect(map->pmap, current->start,
2144 			    current->end,
2145 			    current->protection & MASK(current));
2146 #undef	MASK
2147 		}
2148 		vm_map_simplify_entry(map, current);
2149 	}
2150 	vm_map_unlock(map);
2151 	return (KERN_SUCCESS);
2152 }
2153 
2154 /*
2155  *	vm_map_madvise:
2156  *
2157  *	This routine traverses a process's map, handling the madvise
2158  *	system call.  Advisories are classified as either those affecting
2159  *	the vm_map_entry structure, or those affecting the underlying
2160  *	objects.
2161  */
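/*
 * Illustrative sketch (hypothetical range, not taken from the original
 * sources): the madvise(2) path ends up here with something like
 *
 *	rv = vm_map_madvise(&curproc->p_vmspace->vm_map, trunc_page(addr),
 *	    round_page(addr + len), MADV_WILLNEED);
 *
 * Entry-level advice such as MADV_NOSYNC takes the exclusive-lock path
 * below, while object-level advice such as MADV_WILLNEED or MADV_FREE
 * needs only the read lock.
 */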
2162 int
2163 vm_map_madvise(
2164 	vm_map_t map,
2165 	vm_offset_t start,
2166 	vm_offset_t end,
2167 	int behav)
2168 {
2169 	vm_map_entry_t current, entry;
2170 	int modify_map = 0;
2171 
2172 	/*
2173 	 * Some madvise calls directly modify the vm_map_entry, in which case
2174 	 * we need to use an exclusive lock on the map and we need to perform
2175 	 * various clipping operations.  Otherwise we only need a read-lock
2176 	 * on the map.
2177 	 */
2178 	switch(behav) {
2179 	case MADV_NORMAL:
2180 	case MADV_SEQUENTIAL:
2181 	case MADV_RANDOM:
2182 	case MADV_NOSYNC:
2183 	case MADV_AUTOSYNC:
2184 	case MADV_NOCORE:
2185 	case MADV_CORE:
2186 		if (start == end)
2187 			return (KERN_SUCCESS);
2188 		modify_map = 1;
2189 		vm_map_lock(map);
2190 		break;
2191 	case MADV_WILLNEED:
2192 	case MADV_DONTNEED:
2193 	case MADV_FREE:
2194 		if (start == end)
2195 			return (KERN_SUCCESS);
2196 		vm_map_lock_read(map);
2197 		break;
2198 	default:
2199 		return (KERN_INVALID_ARGUMENT);
2200 	}
2201 
2202 	/*
2203 	 * Locate starting entry and clip if necessary.
2204 	 */
2205 	VM_MAP_RANGE_CHECK(map, start, end);
2206 
2207 	if (vm_map_lookup_entry(map, start, &entry)) {
2208 		if (modify_map)
2209 			vm_map_clip_start(map, entry, start);
2210 	} else {
2211 		entry = entry->next;
2212 	}
2213 
2214 	if (modify_map) {
2215 		/*
2216 		 * madvise behaviors that are implemented in the vm_map_entry.
2217 		 *
2218 		 * We clip the vm_map_entry so that behavioral changes are
2219 		 * limited to the specified address range.
2220 		 */
2221 		for (current = entry;
2222 		     (current != &map->header) && (current->start < end);
2223 		     current = current->next
2224 		) {
2225 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2226 				continue;
2227 
2228 			vm_map_clip_end(map, current, end);
2229 
2230 			switch (behav) {
2231 			case MADV_NORMAL:
2232 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2233 				break;
2234 			case MADV_SEQUENTIAL:
2235 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2236 				break;
2237 			case MADV_RANDOM:
2238 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2239 				break;
2240 			case MADV_NOSYNC:
2241 				current->eflags |= MAP_ENTRY_NOSYNC;
2242 				break;
2243 			case MADV_AUTOSYNC:
2244 				current->eflags &= ~MAP_ENTRY_NOSYNC;
2245 				break;
2246 			case MADV_NOCORE:
2247 				current->eflags |= MAP_ENTRY_NOCOREDUMP;
2248 				break;
2249 			case MADV_CORE:
2250 				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2251 				break;
2252 			default:
2253 				break;
2254 			}
2255 			vm_map_simplify_entry(map, current);
2256 		}
2257 		vm_map_unlock(map);
2258 	} else {
2259 		vm_pindex_t pstart, pend;
2260 
2261 		/*
2262 		 * madvise behaviors that are implemented in the underlying
2263 		 * vm_object.
2264 		 *
2265 		 * Since we don't clip the vm_map_entry, we have to clip
2266 		 * the vm_object pindex and count.
2267 		 */
2268 		for (current = entry;
2269 		     (current != &map->header) && (current->start < end);
2270 		     current = current->next
2271 		) {
2272 			vm_offset_t useEnd, useStart;
2273 
2274 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2275 				continue;
2276 
2277 			pstart = OFF_TO_IDX(current->offset);
2278 			pend = pstart + atop(current->end - current->start);
2279 			useStart = current->start;
2280 			useEnd = current->end;
2281 
2282 			if (current->start < start) {
2283 				pstart += atop(start - current->start);
2284 				useStart = start;
2285 			}
2286 			if (current->end > end) {
2287 				pend -= atop(current->end - end);
2288 				useEnd = end;
2289 			}
2290 
2291 			if (pstart >= pend)
2292 				continue;
2293 
2294 			/*
2295 			 * Perform the pmap_advise() before clearing
2296 			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
2297 			 * concurrent pmap operation, such as pmap_remove(),
2298 			 * could clear a reference in the pmap and set
2299 			 * PGA_REFERENCED on the page before the pmap_advise()
2300 			 * had completed.  Consequently, the page would appear
2301 			 * referenced based upon an old reference that
2302 			 * occurred before this pmap_advise() ran.
2303 			 */
2304 			if (behav == MADV_DONTNEED || behav == MADV_FREE)
2305 				pmap_advise(map->pmap, useStart, useEnd,
2306 				    behav);
2307 
2308 			vm_object_madvise(current->object.vm_object, pstart,
2309 			    pend, behav);
2310 
2311 			/*
2312 			 * Pre-populate paging structures in the
2313 			 * WILLNEED case.  For wired entries, the
2314 			 * paging structures are already populated.
2315 			 */
2316 			if (behav == MADV_WILLNEED &&
2317 			    current->wired_count == 0) {
2318 				vm_map_pmap_enter(map,
2319 				    useStart,
2320 				    current->protection,
2321 				    current->object.vm_object,
2322 				    pstart,
2323 				    ptoa(pend - pstart),
2324 				    MAP_PREFAULT_MADVISE
2325 				);
2326 			}
2327 		}
2328 		vm_map_unlock_read(map);
2329 	}
2330 	return (0);
2331 }
2332 
2333 
2334 /*
2335  *	vm_map_inherit:
2336  *
2337  *	Sets the inheritance of the specified address
2338  *	range in the target map.  Inheritance
2339  *	affects how the map will be shared with
2340  *	child maps at the time of vmspace_fork.
2341  */
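/*
 * Illustrative sketch (hypothetical range, not taken from the original
 * sources): a minherit(2)-style caller that wants a region shared with
 * future children might do
 *
 *	rv = vm_map_inherit(&curproc->p_vmspace->vm_map, start, end,
 *	    VM_INHERIT_SHARE);
 *
 * The change only matters at the next vmspace_fork(); existing children
 * are unaffected.
 */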
2342 int
2343 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2344 	       vm_inherit_t new_inheritance)
2345 {
2346 	vm_map_entry_t entry;
2347 	vm_map_entry_t temp_entry;
2348 
2349 	switch (new_inheritance) {
2350 	case VM_INHERIT_NONE:
2351 	case VM_INHERIT_COPY:
2352 	case VM_INHERIT_SHARE:
2353 	case VM_INHERIT_ZERO:
2354 		break;
2355 	default:
2356 		return (KERN_INVALID_ARGUMENT);
2357 	}
2358 	if (start == end)
2359 		return (KERN_SUCCESS);
2360 	vm_map_lock(map);
2361 	VM_MAP_RANGE_CHECK(map, start, end);
2362 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
2363 		entry = temp_entry;
2364 		vm_map_clip_start(map, entry, start);
2365 	} else
2366 		entry = temp_entry->next;
2367 	while ((entry != &map->header) && (entry->start < end)) {
2368 		vm_map_clip_end(map, entry, end);
2369 		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
2370 		    new_inheritance != VM_INHERIT_ZERO)
2371 			entry->inheritance = new_inheritance;
2372 		vm_map_simplify_entry(map, entry);
2373 		entry = entry->next;
2374 	}
2375 	vm_map_unlock(map);
2376 	return (KERN_SUCCESS);
2377 }
2378 
2379 /*
2380  *	vm_map_unwire:
2381  *
2382  *	Implements both kernel and user unwiring.
2383  */
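/*
 * Illustrative sketch (hypothetical range, not taken from the original
 * sources): a munlock(2)-style caller releasing a user wiring might do
 *
 *	rv = vm_map_unwire(&curproc->p_vmspace->vm_map, start, end,
 *	    VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK);
 *
 * Omitting VM_MAP_WIRE_USER requests a system unwiring instead, and
 * omitting VM_MAP_WIRE_HOLESOK makes a hole in the range an error.
 */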
2384 int
2385 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2386     int flags)
2387 {
2388 	vm_map_entry_t entry, first_entry, tmp_entry;
2389 	vm_offset_t saved_start;
2390 	unsigned int last_timestamp;
2391 	int rv;
2392 	boolean_t need_wakeup, result, user_unwire;
2393 
2394 	if (start == end)
2395 		return (KERN_SUCCESS);
2396 	user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2397 	vm_map_lock(map);
2398 	VM_MAP_RANGE_CHECK(map, start, end);
2399 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
2400 		if (flags & VM_MAP_WIRE_HOLESOK)
2401 			first_entry = first_entry->next;
2402 		else {
2403 			vm_map_unlock(map);
2404 			return (KERN_INVALID_ADDRESS);
2405 		}
2406 	}
2407 	last_timestamp = map->timestamp;
2408 	entry = first_entry;
2409 	while (entry != &map->header && entry->start < end) {
2410 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2411 			/*
2412 			 * We have not yet clipped the entry.
2413 			 */
2414 			saved_start = (start >= entry->start) ? start :
2415 			    entry->start;
2416 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2417 			if (vm_map_unlock_and_wait(map, 0)) {
2418 				/*
2419 				 * Allow interruption of user unwiring?
2420 				 */
2421 			}
2422 			vm_map_lock(map);
2423 			if (last_timestamp+1 != map->timestamp) {
2424 				/*
2425 				 * Look again for the entry because the map was
2426 				 * modified while it was unlocked.
2427 				 * Specifically, the entry may have been
2428 				 * clipped, merged, or deleted.
2429 				 */
2430 				if (!vm_map_lookup_entry(map, saved_start,
2431 				    &tmp_entry)) {
2432 					if (flags & VM_MAP_WIRE_HOLESOK)
2433 						tmp_entry = tmp_entry->next;
2434 					else {
2435 						if (saved_start == start) {
2436 							/*
2437 							 * first_entry has been deleted.
2438 							 */
2439 							vm_map_unlock(map);
2440 							return (KERN_INVALID_ADDRESS);
2441 						}
2442 						end = saved_start;
2443 						rv = KERN_INVALID_ADDRESS;
2444 						goto done;
2445 					}
2446 				}
2447 				if (entry == first_entry)
2448 					first_entry = tmp_entry;
2449 				else
2450 					first_entry = NULL;
2451 				entry = tmp_entry;
2452 			}
2453 			last_timestamp = map->timestamp;
2454 			continue;
2455 		}
2456 		vm_map_clip_start(map, entry, start);
2457 		vm_map_clip_end(map, entry, end);
2458 		/*
2459 		 * Mark the entry in case the map lock is released.  (See
2460 		 * above.)
2461 		 */
2462 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2463 		    entry->wiring_thread == NULL,
2464 		    ("owned map entry %p", entry));
2465 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2466 		entry->wiring_thread = curthread;
2467 		/*
2468 		 * Check the map for holes in the specified region.
2469 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2470 		 */
2471 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2472 		    (entry->end < end && (entry->next == &map->header ||
2473 		    entry->next->start > entry->end))) {
2474 			end = entry->end;
2475 			rv = KERN_INVALID_ADDRESS;
2476 			goto done;
2477 		}
2478 		/*
2479 		 * If system unwiring, require that the entry is system wired.
2480 		 */
2481 		if (!user_unwire &&
2482 		    vm_map_entry_system_wired_count(entry) == 0) {
2483 			end = entry->end;
2484 			rv = KERN_INVALID_ARGUMENT;
2485 			goto done;
2486 		}
2487 		entry = entry->next;
2488 	}
2489 	rv = KERN_SUCCESS;
2490 done:
2491 	need_wakeup = FALSE;
2492 	if (first_entry == NULL) {
2493 		result = vm_map_lookup_entry(map, start, &first_entry);
2494 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2495 			first_entry = first_entry->next;
2496 		else
2497 			KASSERT(result, ("vm_map_unwire: lookup failed"));
2498 	}
2499 	for (entry = first_entry; entry != &map->header && entry->start < end;
2500 	    entry = entry->next) {
2501 		/*
2502 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
2503 		 * space in the unwired region could have been mapped
2504 		 * while the map lock was dropped for draining
2505 		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
2506 		 * could be simultaneously wiring this new mapping
2507 		 * entry.  Detect these cases and skip any entries not
2508 		 * marked as in transition by us.
2509 		 */
2510 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2511 		    entry->wiring_thread != curthread) {
2512 			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2513 			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
2514 			continue;
2515 		}
2516 
2517 		if (rv == KERN_SUCCESS && (!user_unwire ||
2518 		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
2519 			if (user_unwire)
2520 				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2521 			if (entry->wired_count == 1)
2522 				vm_map_entry_unwire(map, entry);
2523 			else
2524 				entry->wired_count--;
2525 		}
2526 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2527 		    ("vm_map_unwire: in-transition flag missing %p", entry));
2528 		KASSERT(entry->wiring_thread == curthread,
2529 		    ("vm_map_unwire: alien wire %p", entry));
2530 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2531 		entry->wiring_thread = NULL;
2532 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2533 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2534 			need_wakeup = TRUE;
2535 		}
2536 		vm_map_simplify_entry(map, entry);
2537 	}
2538 	vm_map_unlock(map);
2539 	if (need_wakeup)
2540 		vm_map_wakeup(map);
2541 	return (rv);
2542 }
2543 
2544 /*
2545  *	vm_map_wire_entry_failure:
2546  *
2547  *	Handle a wiring failure on the given entry.
2548  *
2549  *	The map should be locked.
2550  */
2551 static void
2552 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
2553     vm_offset_t failed_addr)
2554 {
2555 
2556 	VM_MAP_ASSERT_LOCKED(map);
2557 	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
2558 	    entry->wired_count == 1,
2559 	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
2560 	KASSERT(failed_addr < entry->end,
2561 	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
2562 
2563 	/*
2564 	 * If any pages at the start of this entry were successfully wired,
2565 	 * then unwire them.
2566 	 */
2567 	if (failed_addr > entry->start) {
2568 		pmap_unwire(map->pmap, entry->start, failed_addr);
2569 		vm_object_unwire(entry->object.vm_object, entry->offset,
2570 		    failed_addr - entry->start, PQ_ACTIVE);
2571 	}
2572 
2573 	/*
2574 	 * Assign an out-of-range value to represent the failure to wire this
2575 	 * entry.
2576 	 */
2577 	entry->wired_count = -1;
2578 }
2579 
2580 /*
2581  *	vm_map_wire:
2582  *
2583  *	Implements both kernel and user wiring.
2584  */
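/*
 * Illustrative sketch (hypothetical range, not taken from the original
 * sources): an mlock(2)-style caller wiring user pages might do
 *
 *	rv = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
 *	    VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK);
 *
 * Adding VM_MAP_WIRE_WRITE additionally requires VM_PROT_WRITE on each
 * entry; entries lacking the required protection are skipped when
 * VM_MAP_WIRE_HOLESOK is given and otherwise cause the call to fail.
 */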
2585 int
2586 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2587     int flags)
2588 {
2589 	vm_map_entry_t entry, first_entry, tmp_entry;
2590 	vm_offset_t faddr, saved_end, saved_start;
2591 	unsigned int last_timestamp;
2592 	int rv;
2593 	boolean_t need_wakeup, result, user_wire;
2594 	vm_prot_t prot;
2595 
2596 	if (start == end)
2597 		return (KERN_SUCCESS);
2598 	prot = 0;
2599 	if (flags & VM_MAP_WIRE_WRITE)
2600 		prot |= VM_PROT_WRITE;
2601 	user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2602 	vm_map_lock(map);
2603 	VM_MAP_RANGE_CHECK(map, start, end);
2604 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
2605 		if (flags & VM_MAP_WIRE_HOLESOK)
2606 			first_entry = first_entry->next;
2607 		else {
2608 			vm_map_unlock(map);
2609 			return (KERN_INVALID_ADDRESS);
2610 		}
2611 	}
2612 	last_timestamp = map->timestamp;
2613 	entry = first_entry;
2614 	while (entry != &map->header && entry->start < end) {
2615 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2616 			/*
2617 			 * We have not yet clipped the entry.
2618 			 */
2619 			saved_start = (start >= entry->start) ? start :
2620 			    entry->start;
2621 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2622 			if (vm_map_unlock_and_wait(map, 0)) {
2623 				/*
2624 				 * Allow interruption of user wiring?
2625 				 */
2626 			}
2627 			vm_map_lock(map);
2628 			if (last_timestamp + 1 != map->timestamp) {
2629 				/*
2630 				 * Look again for the entry because the map was
2631 				 * modified while it was unlocked.
2632 				 * Specifically, the entry may have been
2633 				 * clipped, merged, or deleted.
2634 				 */
2635 				if (!vm_map_lookup_entry(map, saved_start,
2636 				    &tmp_entry)) {
2637 					if (flags & VM_MAP_WIRE_HOLESOK)
2638 						tmp_entry = tmp_entry->next;
2639 					else {
2640 						if (saved_start == start) {
2641 							/*
2642 							 * first_entry has been deleted.
2643 							 */
2644 							vm_map_unlock(map);
2645 							return (KERN_INVALID_ADDRESS);
2646 						}
2647 						end = saved_start;
2648 						rv = KERN_INVALID_ADDRESS;
2649 						goto done;
2650 					}
2651 				}
2652 				if (entry == first_entry)
2653 					first_entry = tmp_entry;
2654 				else
2655 					first_entry = NULL;
2656 				entry = tmp_entry;
2657 			}
2658 			last_timestamp = map->timestamp;
2659 			continue;
2660 		}
2661 		vm_map_clip_start(map, entry, start);
2662 		vm_map_clip_end(map, entry, end);
2663 		/*
2664 		 * Mark the entry in case the map lock is released.  (See
2665 		 * above.)
2666 		 */
2667 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2668 		    entry->wiring_thread == NULL,
2669 		    ("owned map entry %p", entry));
2670 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2671 		entry->wiring_thread = curthread;
2672 		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
2673 		    || (entry->protection & prot) != prot) {
2674 			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
2675 			if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
2676 				end = entry->end;
2677 				rv = KERN_INVALID_ADDRESS;
2678 				goto done;
2679 			}
2680 			goto next_entry;
2681 		}
2682 		if (entry->wired_count == 0) {
2683 			entry->wired_count++;
2684 			saved_start = entry->start;
2685 			saved_end = entry->end;
2686 
2687 			/*
2688 			 * Release the map lock, relying on the in-transition
2689 			 * mark.  Mark the map busy for fork.
2690 			 */
2691 			vm_map_busy(map);
2692 			vm_map_unlock(map);
2693 
2694 			faddr = saved_start;
2695 			do {
2696 				/*
2697 				 * Simulate a fault to get the page and enter
2698 				 * it into the physical map.
2699 				 */
2700 				if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
2701 				    VM_FAULT_WIRE)) != KERN_SUCCESS)
2702 					break;
2703 			} while ((faddr += PAGE_SIZE) < saved_end);
2704 			vm_map_lock(map);
2705 			vm_map_unbusy(map);
2706 			if (last_timestamp + 1 != map->timestamp) {
2707 				/*
2708 				 * Look again for the entry because the map was
2709 				 * modified while it was unlocked.  The entry
2710 				 * may have been clipped, but NOT merged or
2711 				 * deleted.
2712 				 */
2713 				result = vm_map_lookup_entry(map, saved_start,
2714 				    &tmp_entry);
2715 				KASSERT(result, ("vm_map_wire: lookup failed"));
2716 				if (entry == first_entry)
2717 					first_entry = tmp_entry;
2718 				else
2719 					first_entry = NULL;
2720 				entry = tmp_entry;
2721 				while (entry->end < saved_end) {
2722 					/*
2723 					 * In case of failure, handle entries
2724 					 * that were not fully wired here;
2725 					 * fully wired entries are handled
2726 					 * later.
2727 					 */
2728 					if (rv != KERN_SUCCESS &&
2729 					    faddr < entry->end)
2730 						vm_map_wire_entry_failure(map,
2731 						    entry, faddr);
2732 					entry = entry->next;
2733 				}
2734 			}
2735 			last_timestamp = map->timestamp;
2736 			if (rv != KERN_SUCCESS) {
2737 				vm_map_wire_entry_failure(map, entry, faddr);
2738 				end = entry->end;
2739 				goto done;
2740 			}
2741 		} else if (!user_wire ||
2742 			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2743 			entry->wired_count++;
2744 		}
2745 		/*
2746 		 * Check the map for holes in the specified region.
2747 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2748 		 */
2749 	next_entry:
2750 		if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
2751 		    entry->end < end && (entry->next == &map->header ||
2752 		    entry->next->start > entry->end)) {
2753 			end = entry->end;
2754 			rv = KERN_INVALID_ADDRESS;
2755 			goto done;
2756 		}
2757 		entry = entry->next;
2758 	}
2759 	rv = KERN_SUCCESS;
2760 done:
2761 	need_wakeup = FALSE;
2762 	if (first_entry == NULL) {
2763 		result = vm_map_lookup_entry(map, start, &first_entry);
2764 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2765 			first_entry = first_entry->next;
2766 		else
2767 			KASSERT(result, ("vm_map_wire: lookup failed"));
2768 	}
2769 	for (entry = first_entry; entry != &map->header && entry->start < end;
2770 	    entry = entry->next) {
2771 		/*
2772 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
2773 		 * space in the unwired region could have been mapped
2774 		 * while the map lock was dropped for faulting in the
2775 		 * pages or draining MAP_ENTRY_IN_TRANSITION.
2776 		 * Moreover, another thread could be simultaneously
2777 		 * wiring this new mapping entry.  Detect these cases
2778 		 * and skip any entries not marked as in transition by us.
2779 		 */
2780 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2781 		    entry->wiring_thread != curthread) {
2782 			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2783 			    ("vm_map_wire: !HOLESOK and new/changed entry"));
2784 			continue;
2785 		}
2786 
2787 		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
2788 			goto next_entry_done;
2789 
2790 		if (rv == KERN_SUCCESS) {
2791 			if (user_wire)
2792 				entry->eflags |= MAP_ENTRY_USER_WIRED;
2793 		} else if (entry->wired_count == -1) {
2794 			/*
2795 			 * Wiring failed on this entry.  Thus, unwiring is
2796 			 * unnecessary.
2797 			 */
2798 			entry->wired_count = 0;
2799 		} else if (!user_wire ||
2800 		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2801 			/*
2802 			 * Undo the wiring.  Wiring succeeded on this entry
2803 			 * but failed on a later entry.
2804 			 */
2805 			if (entry->wired_count == 1)
2806 				vm_map_entry_unwire(map, entry);
2807 			else
2808 				entry->wired_count--;
2809 		}
2810 	next_entry_done:
2811 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2812 		    ("vm_map_wire: in-transition flag missing %p", entry));
2813 		KASSERT(entry->wiring_thread == curthread,
2814 		    ("vm_map_wire: alien wire %p", entry));
2815 		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
2816 		    MAP_ENTRY_WIRE_SKIPPED);
2817 		entry->wiring_thread = NULL;
2818 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2819 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2820 			need_wakeup = TRUE;
2821 		}
2822 		vm_map_simplify_entry(map, entry);
2823 	}
2824 	vm_map_unlock(map);
2825 	if (need_wakeup)
2826 		vm_map_wakeup(map);
2827 	return (rv);
2828 }
2829 
2830 /*
2831  * vm_map_sync
2832  *
2833  * Push any dirty cached pages in the address range to their pager.
2834  * If syncio is TRUE, dirty pages are written synchronously.
2835  * If invalidate is TRUE, any cached pages are freed as well.
2836  *
2837  * If the size of the region from start to end is zero, we are
2838  * supposed to flush all modified pages within the region containing
2839  * start.  Unfortunately, a region can be split or coalesced with
2840  * neighboring regions, making it difficult to determine what the
2841  * original region was.  Therefore, we approximate this requirement by
2842  * flushing the current region containing start.
2843  *
2844  * Returns an error if any part of the specified range is not mapped.
2845  */
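/*
 * Illustrative sketch (hypothetical range, not taken from the original
 * sources): an msync(2)-style caller flushing a dirty file mapping
 * synchronously might do
 *
 *	rv = vm_map_sync(&curproc->p_vmspace->vm_map, start, end,
 *	    TRUE, FALSE);
 *
 * and would pass start == end to request the "whole containing region"
 * behavior approximated above.
 */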
2846 int
2847 vm_map_sync(
2848 	vm_map_t map,
2849 	vm_offset_t start,
2850 	vm_offset_t end,
2851 	boolean_t syncio,
2852 	boolean_t invalidate)
2853 {
2854 	vm_map_entry_t current;
2855 	vm_map_entry_t entry;
2856 	vm_size_t size;
2857 	vm_object_t object;
2858 	vm_ooffset_t offset;
2859 	unsigned int last_timestamp;
2860 	boolean_t failed;
2861 
2862 	vm_map_lock_read(map);
2863 	VM_MAP_RANGE_CHECK(map, start, end);
2864 	if (!vm_map_lookup_entry(map, start, &entry)) {
2865 		vm_map_unlock_read(map);
2866 		return (KERN_INVALID_ADDRESS);
2867 	} else if (start == end) {
2868 		start = entry->start;
2869 		end = entry->end;
2870 	}
2871 	/*
2872 	 * Make a first pass to check for user-wired memory and holes.
2873 	 */
2874 	for (current = entry; current != &map->header && current->start < end;
2875 	    current = current->next) {
2876 		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
2877 			vm_map_unlock_read(map);
2878 			return (KERN_INVALID_ARGUMENT);
2879 		}
2880 		if (end > current->end &&
2881 		    (current->next == &map->header ||
2882 			current->end != current->next->start)) {
2883 			vm_map_unlock_read(map);
2884 			return (KERN_INVALID_ADDRESS);
2885 		}
2886 	}
2887 
2888 	if (invalidate)
2889 		pmap_remove(map->pmap, start, end);
2890 	failed = FALSE;
2891 
2892 	/*
2893 	 * Make a second pass, cleaning/uncaching pages from the indicated
2894 	 * objects as we go.
2895 	 */
2896 	for (current = entry; current != &map->header && current->start < end;) {
2897 		offset = current->offset + (start - current->start);
2898 		size = (end <= current->end ? end : current->end) - start;
2899 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2900 			vm_map_t smap;
2901 			vm_map_entry_t tentry;
2902 			vm_size_t tsize;
2903 
2904 			smap = current->object.sub_map;
2905 			vm_map_lock_read(smap);
2906 			(void) vm_map_lookup_entry(smap, offset, &tentry);
2907 			tsize = tentry->end - offset;
2908 			if (tsize < size)
2909 				size = tsize;
2910 			object = tentry->object.vm_object;
2911 			offset = tentry->offset + (offset - tentry->start);
2912 			vm_map_unlock_read(smap);
2913 		} else {
2914 			object = current->object.vm_object;
2915 		}
2916 		vm_object_reference(object);
2917 		last_timestamp = map->timestamp;
2918 		vm_map_unlock_read(map);
2919 		if (!vm_object_sync(object, offset, size, syncio, invalidate))
2920 			failed = TRUE;
2921 		start += size;
2922 		vm_object_deallocate(object);
2923 		vm_map_lock_read(map);
2924 		if (last_timestamp == map->timestamp ||
2925 		    !vm_map_lookup_entry(map, start, &current))
2926 			current = current->next;
2927 	}
2928 
2929 	vm_map_unlock_read(map);
2930 	return (failed ? KERN_FAILURE : KERN_SUCCESS);
2931 }
2932 
2933 /*
2934  *	vm_map_entry_unwire:	[ internal use only ]
2935  *
2936  *	Make the region specified by this entry pageable.
2937  *
2938  *	The map in question should be locked.
2939  *	[This is the reason for this routine's existence.]
2940  */
2941 static void
2942 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2943 {
2944 
2945 	VM_MAP_ASSERT_LOCKED(map);
2946 	KASSERT(entry->wired_count > 0,
2947 	    ("vm_map_entry_unwire: entry %p isn't wired", entry));
2948 	pmap_unwire(map->pmap, entry->start, entry->end);
2949 	vm_object_unwire(entry->object.vm_object, entry->offset, entry->end -
2950 	    entry->start, PQ_ACTIVE);
2951 	entry->wired_count = 0;
2952 }
2953 
2954 static void
2955 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
2956 {
2957 
2958 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
2959 		vm_object_deallocate(entry->object.vm_object);
2960 	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
2961 }
2962 
2963 /*
2964  *	vm_map_entry_delete:	[ internal use only ]
2965  *
2966  *	Deallocate the given entry from the target map.
2967  */
2968 static void
2969 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
2970 {
2971 	vm_object_t object;
2972 	vm_pindex_t offidxstart, offidxend, count, size1;
2973 	vm_size_t size;
2974 
2975 	vm_map_entry_unlink(map, entry);
2976 	object = entry->object.vm_object;
2977 
2978 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
2979 		MPASS(entry->cred == NULL);
2980 		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
2981 		MPASS(object == NULL);
2982 		vm_map_entry_deallocate(entry, map->system_map);
2983 		return;
2984 	}
2985 
2986 	size = entry->end - entry->start;
2987 	map->size -= size;
2988 
2989 	if (entry->cred != NULL) {
2990 		swap_release_by_cred(size, entry->cred);
2991 		crfree(entry->cred);
2992 	}
2993 
2994 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
2995 	    (object != NULL)) {
2996 		KASSERT(entry->cred == NULL || object->cred == NULL ||
2997 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
2998 		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
2999 		count = atop(size);
3000 		offidxstart = OFF_TO_IDX(entry->offset);
3001 		offidxend = offidxstart + count;
3002 		VM_OBJECT_WLOCK(object);
3003 		if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
3004 		    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
3005 		    object == kernel_object)) {
3006 			vm_object_collapse(object);
3007 
3008 			/*
3009 			 * The option OBJPR_NOTMAPPED can be passed here
3010 			 * because vm_map_delete() already performed
3011 			 * pmap_remove() on the only mapping to this range
3012 			 * of pages.
3013 			 */
3014 			vm_object_page_remove(object, offidxstart, offidxend,
3015 			    OBJPR_NOTMAPPED);
3016 			if (object->type == OBJT_SWAP)
3017 				swap_pager_freespace(object, offidxstart,
3018 				    count);
3019 			if (offidxend >= object->size &&
3020 			    offidxstart < object->size) {
3021 				size1 = object->size;
3022 				object->size = offidxstart;
3023 				if (object->cred != NULL) {
3024 					size1 -= object->size;
3025 					KASSERT(object->charge >= ptoa(size1),
3026 					    ("object %p charge < 0", object));
3027 					swap_release_by_cred(ptoa(size1),
3028 					    object->cred);
3029 					object->charge -= ptoa(size1);
3030 				}
3031 			}
3032 		}
3033 		VM_OBJECT_WUNLOCK(object);
3034 	} else
3035 		entry->object.vm_object = NULL;
3036 	if (map->system_map)
3037 		vm_map_entry_deallocate(entry, TRUE);
3038 	else {
3039 		entry->next = curthread->td_map_def_user;
3040 		curthread->td_map_def_user = entry;
3041 	}
3042 }
3043 
3044 /*
3045  *	vm_map_delete:	[ internal use only ]
3046  *
3047  *	Deallocates the given address range from the target
3048  *	map.
3049  */
3050 int
3051 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
3052 {
3053 	vm_map_entry_t entry;
3054 	vm_map_entry_t first_entry;
3055 
3056 	VM_MAP_ASSERT_LOCKED(map);
3057 	if (start == end)
3058 		return (KERN_SUCCESS);
3059 
3060 	/*
3061 	 * Find the start of the region, and clip it
3062 	 */
3063 	if (!vm_map_lookup_entry(map, start, &first_entry))
3064 		entry = first_entry->next;
3065 	else {
3066 		entry = first_entry;
3067 		vm_map_clip_start(map, entry, start);
3068 	}
3069 
3070 	/*
3071 	 * Step through all entries in this region
3072 	 */
3073 	while ((entry != &map->header) && (entry->start < end)) {
3074 		vm_map_entry_t next;
3075 
3076 		/*
3077 		 * Wait for wiring or unwiring of an entry to complete.
3078 		 * Also wait for any system wirings to disappear on
3079 		 * user maps.
3080 		 */
3081 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3082 		    (vm_map_pmap(map) != kernel_pmap &&
3083 		    vm_map_entry_system_wired_count(entry) != 0)) {
3084 			unsigned int last_timestamp;
3085 			vm_offset_t saved_start;
3086 			vm_map_entry_t tmp_entry;
3087 
3088 			saved_start = entry->start;
3089 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3090 			last_timestamp = map->timestamp;
3091 			(void) vm_map_unlock_and_wait(map, 0);
3092 			vm_map_lock(map);
3093 			if (last_timestamp + 1 != map->timestamp) {
3094 				/*
3095 				 * Look again for the entry because the map was
3096 				 * modified while it was unlocked.
3097 				 * Specifically, the entry may have been
3098 				 * clipped, merged, or deleted.
3099 				 */
3100 				if (!vm_map_lookup_entry(map, saved_start,
3101 							 &tmp_entry))
3102 					entry = tmp_entry->next;
3103 				else {
3104 					entry = tmp_entry;
3105 					vm_map_clip_start(map, entry,
3106 							  saved_start);
3107 				}
3108 			}
3109 			continue;
3110 		}
3111 		vm_map_clip_end(map, entry, end);
3112 
3113 		next = entry->next;
3114 
3115 		/*
3116 		 * Unwire before removing addresses from the pmap; otherwise,
3117 		 * unwiring will put the entries back in the pmap.
3118 		 */
3119 		if (entry->wired_count != 0) {
3120 			vm_map_entry_unwire(map, entry);
3121 		}
3122 
3123 		pmap_remove(map->pmap, entry->start, entry->end);
3124 
3125 		/*
3126 		 * Delete the entry only after removing all pmap
3127 		 * entries pointing to its pages.  (Otherwise, its
3128 		 * page frames may be reallocated, and any modify bits
3129 		 * will be set in the wrong object!)
3130 		 */
3131 		vm_map_entry_delete(map, entry);
3132 		entry = next;
3133 	}
3134 	return (KERN_SUCCESS);
3135 }
3136 
3137 /*
3138  *	vm_map_remove:
3139  *
3140  *	Remove the given address range from the target map.
3141  *	This is the exported form of vm_map_delete.
3142  */
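/*
 * Illustrative sketch (hypothetical range, not taken from the original
 * sources): a munmap(2)-style caller simply does
 *
 *	(void) vm_map_remove(&curproc->p_vmspace->vm_map, trunc_page(addr),
 *	    round_page(addr + len));
 *
 * vm_map_remove() takes the map lock itself; callers that already hold
 * the lock use vm_map_delete() directly.
 */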
3143 int
3144 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3145 {
3146 	int result;
3147 
3148 	vm_map_lock(map);
3149 	VM_MAP_RANGE_CHECK(map, start, end);
3150 	result = vm_map_delete(map, start, end);
3151 	vm_map_unlock(map);
3152 	return (result);
3153 }
3154 
3155 /*
3156  *	vm_map_check_protection:
3157  *
3158  *	Assert that the target map allows the specified privilege on the
3159  *	entire address region given.  The entire region must be allocated.
3160  *
3161  *	WARNING!  This code does not and should not check whether the
3162  *	contents of the region are accessible.  For example, a smaller file
3163  *	might be mapped into a larger address space.
3164  *
3165  *	NOTE!  This code is also called by munmap().
3166  *
3167  *	The map must be locked.  A read lock is sufficient.
3168  */
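/*
 * Illustrative sketch (hypothetical range, not taken from the original
 * sources):
 *
 *	vm_map_lock_read(map);
 *	ok = vm_map_check_protection(map, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *	vm_map_unlock_read(map);
 *
 * This yields TRUE only if every page of [start, end) is mapped with at
 * least read and write permission.
 */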
3169 boolean_t
3170 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3171 			vm_prot_t protection)
3172 {
3173 	vm_map_entry_t entry;
3174 	vm_map_entry_t tmp_entry;
3175 
3176 	if (!vm_map_lookup_entry(map, start, &tmp_entry))
3177 		return (FALSE);
3178 	entry = tmp_entry;
3179 
3180 	while (start < end) {
3181 		if (entry == &map->header)
3182 			return (FALSE);
3183 		/*
3184 		 * No holes allowed!
3185 		 */
3186 		if (start < entry->start)
3187 			return (FALSE);
3188 		/*
3189 		 * Check protection associated with entry.
3190 		 */
3191 		if ((entry->protection & protection) != protection)
3192 			return (FALSE);
3193 		/* go to next entry */
3194 		start = entry->end;
3195 		entry = entry->next;
3196 	}
3197 	return (TRUE);
3198 }
3199 
3200 /*
3201  *	vm_map_copy_entry:
3202  *
3203  *	Copies the contents of the source entry to the destination
3204  *	entry.  The entries *must* be aligned properly.
3205  */
3206 static void
3207 vm_map_copy_entry(
3208 	vm_map_t src_map,
3209 	vm_map_t dst_map,
3210 	vm_map_entry_t src_entry,
3211 	vm_map_entry_t dst_entry,
3212 	vm_ooffset_t *fork_charge)
3213 {
3214 	vm_object_t src_object;
3215 	vm_map_entry_t fake_entry;
3216 	vm_offset_t size;
3217 	struct ucred *cred;
3218 	int charged;
3219 
3220 	VM_MAP_ASSERT_LOCKED(dst_map);
3221 
3222 	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
3223 		return;
3224 
3225 	if (src_entry->wired_count == 0 ||
3226 	    (src_entry->protection & VM_PROT_WRITE) == 0) {
3227 		/*
3228 		 * If the source entry is marked needs_copy, it is already
3229 		 * write-protected.
3230 		 */
3231 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
3232 		    (src_entry->protection & VM_PROT_WRITE) != 0) {
3233 			pmap_protect(src_map->pmap,
3234 			    src_entry->start,
3235 			    src_entry->end,
3236 			    src_entry->protection & ~VM_PROT_WRITE);
3237 		}
3238 
3239 		/*
3240 		 * Make a copy of the object.
3241 		 */
3242 		size = src_entry->end - src_entry->start;
3243 		if ((src_object = src_entry->object.vm_object) != NULL) {
3244 			VM_OBJECT_WLOCK(src_object);
3245 			charged = ENTRY_CHARGED(src_entry);
3246 			if (src_object->handle == NULL &&
3247 			    (src_object->type == OBJT_DEFAULT ||
3248 			    src_object->type == OBJT_SWAP)) {
3249 				vm_object_collapse(src_object);
3250 				if ((src_object->flags & (OBJ_NOSPLIT |
3251 				    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
3252 					vm_object_split(src_entry);
3253 					src_object =
3254 					    src_entry->object.vm_object;
3255 				}
3256 			}
3257 			vm_object_reference_locked(src_object);
3258 			vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3259 			if (src_entry->cred != NULL &&
3260 			    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3261 				KASSERT(src_object->cred == NULL,
3262 				    ("OVERCOMMIT: vm_map_copy_entry: cred %p",
3263 				     src_object));
3264 				src_object->cred = src_entry->cred;
3265 				src_object->charge = size;
3266 			}
3267 			VM_OBJECT_WUNLOCK(src_object);
3268 			dst_entry->object.vm_object = src_object;
3269 			if (charged) {
3270 				cred = curthread->td_ucred;
3271 				crhold(cred);
3272 				dst_entry->cred = cred;
3273 				*fork_charge += size;
3274 				if (!(src_entry->eflags &
3275 				      MAP_ENTRY_NEEDS_COPY)) {
3276 					crhold(cred);
3277 					src_entry->cred = cred;
3278 					*fork_charge += size;
3279 				}
3280 			}
3281 			src_entry->eflags |= MAP_ENTRY_COW |
3282 			    MAP_ENTRY_NEEDS_COPY;
3283 			dst_entry->eflags |= MAP_ENTRY_COW |
3284 			    MAP_ENTRY_NEEDS_COPY;
3285 			dst_entry->offset = src_entry->offset;
3286 			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3287 				/*
3288 				 * MAP_ENTRY_VN_WRITECNT cannot
3289 				 * indicate write reference from
3290 				 * src_entry, since the entry is
3291 				 * marked as needs copy.  Allocate a
3292 				 * fake entry that is used to
3293 				 * decrement object->un_pager.vnp.writecount
3294 				 * at the appropriate time.  Attach
3295 				 * fake_entry to the deferred list.
3296 				 */
3297 				fake_entry = vm_map_entry_create(dst_map);
3298 				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
3299 				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
3300 				vm_object_reference(src_object);
3301 				fake_entry->object.vm_object = src_object;
3302 				fake_entry->start = src_entry->start;
3303 				fake_entry->end = src_entry->end;
3304 				fake_entry->next = curthread->td_map_def_user;
3305 				curthread->td_map_def_user = fake_entry;
3306 			}
3307 
3308 			pmap_copy(dst_map->pmap, src_map->pmap,
3309 			    dst_entry->start, dst_entry->end - dst_entry->start,
3310 			    src_entry->start);
3311 		} else {
3312 			dst_entry->object.vm_object = NULL;
3313 			dst_entry->offset = 0;
3314 			if (src_entry->cred != NULL) {
3315 				dst_entry->cred = curthread->td_ucred;
3316 				crhold(dst_entry->cred);
3317 				*fork_charge += size;
3318 			}
3319 		}
3320 	} else {
3321 		/*
3322 		 * We don't want to make writeable wired pages copy-on-write.
3323 		 * Immediately copy these pages into the new map by simulating
3324 		 * page faults.  The new pages are pageable.
3325 		 */
3326 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
3327 		    fork_charge);
3328 	}
3329 }
3330 
3331 /*
3332  * vmspace_map_entry_forked:
3333  * Update the newly-forked vmspace each time a map entry is inherited
3334  * or copied.  The values for vm_dsize and vm_tsize are approximate
3335  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
3336  */
3337 static void
3338 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
3339     vm_map_entry_t entry)
3340 {
3341 	vm_size_t entrysize;
3342 	vm_offset_t newend;
3343 
3344 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
3345 		return;
3346 	entrysize = entry->end - entry->start;
3347 	vm2->vm_map.size += entrysize;
3348 	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
3349 		vm2->vm_ssize += btoc(entrysize);
3350 	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
3351 	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
3352 		newend = MIN(entry->end,
3353 		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
3354 		vm2->vm_dsize += btoc(newend - entry->start);
3355 	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
3356 	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
3357 		newend = MIN(entry->end,
3358 		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
3359 		vm2->vm_tsize += btoc(newend - entry->start);
3360 	}
3361 }
3362 
3363 /*
3364  * vmspace_fork:
3365  * Create a new process vmspace structure and vm_map
3366  * based on those of an existing process.  The new map
3367  * is based on the old map, according to the inheritance
3368  * values on the regions in that map.
3369  *
3370  * XXX It might be worth coalescing the entries added to the new vmspace.
3371  *
3372  * The source map must not be locked.
3373  */
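/*
 * Summary of the inheritance handling below (informational only):
 *
 *	VM_INHERIT_NONE		the range does not appear in the child
 *	VM_INHERIT_SHARE	parent and child share the same object
 *	VM_INHERIT_COPY		the child receives a copy-on-write copy
 *	VM_INHERIT_ZERO		the child receives a fresh anonymous
 *				mapping that is zero-filled on first use
 */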
3374 struct vmspace *
3375 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
3376 {
3377 	struct vmspace *vm2;
3378 	vm_map_t new_map, old_map;
3379 	vm_map_entry_t new_entry, old_entry;
3380 	vm_object_t object;
3381 	int locked;
3382 	vm_inherit_t inh;
3383 
3384 	old_map = &vm1->vm_map;
3385 	/* Copy immutable fields of vm1 to vm2. */
3386 	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
3387 	if (vm2 == NULL)
3388 		return (NULL);
3389 	vm2->vm_taddr = vm1->vm_taddr;
3390 	vm2->vm_daddr = vm1->vm_daddr;
3391 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3392 	vm_map_lock(old_map);
3393 	if (old_map->busy)
3394 		vm_map_wait_busy(old_map);
3395 	new_map = &vm2->vm_map;
3396 	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3397 	KASSERT(locked, ("vmspace_fork: lock failed"));
3398 
3399 	old_entry = old_map->header.next;
3400 
3401 	while (old_entry != &old_map->header) {
3402 		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3403 			panic("vm_map_fork: encountered a submap");
3404 
3405 		inh = old_entry->inheritance;
3406 		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
3407 		    inh != VM_INHERIT_NONE)
3408 			inh = VM_INHERIT_COPY;
3409 
3410 		switch (inh) {
3411 		case VM_INHERIT_NONE:
3412 			break;
3413 
3414 		case VM_INHERIT_SHARE:
3415 			/*
3416 			 * Clone the entry, creating the shared object if necessary.
3417 			 */
3418 			object = old_entry->object.vm_object;
3419 			if (object == NULL) {
3420 				object = vm_object_allocate(OBJT_DEFAULT,
3421 					atop(old_entry->end - old_entry->start));
3422 				old_entry->object.vm_object = object;
3423 				old_entry->offset = 0;
3424 				if (old_entry->cred != NULL) {
3425 					object->cred = old_entry->cred;
3426 					object->charge = old_entry->end -
3427 					    old_entry->start;
3428 					old_entry->cred = NULL;
3429 				}
3430 			}
3431 
3432 			/*
3433 			 * Add the reference before calling vm_object_shadow
3434 			 * to ensure that a shadow object is created.
3435 			 */
3436 			vm_object_reference(object);
3437 			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3438 				vm_object_shadow(&old_entry->object.vm_object,
3439 				    &old_entry->offset,
3440 				    old_entry->end - old_entry->start);
3441 				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3442 				/* Transfer the second reference too. */
3443 				vm_object_reference(
3444 				    old_entry->object.vm_object);
3445 
3446 				/*
3447 				 * As in vm_map_simplify_entry(), the
3448 				 * vnode lock will not be acquired in
3449 				 * this call to vm_object_deallocate().
3450 				 */
3451 				vm_object_deallocate(object);
3452 				object = old_entry->object.vm_object;
3453 			}
3454 			VM_OBJECT_WLOCK(object);
3455 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
3456 			if (old_entry->cred != NULL) {
3457 				KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
3458 				object->cred = old_entry->cred;
3459 				object->charge = old_entry->end - old_entry->start;
3460 				old_entry->cred = NULL;
3461 			}
3462 
3463 			/*
3464 			 * Assert the correct state of the vnode
3465 			 * v_writecount while the object is locked, so that
3466 			 * the object does not have to be relocked later
3467 			 * just for the assertion.
3468 			 */
3469 			if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT &&
3470 			    object->type == OBJT_VNODE) {
3471 				KASSERT(((struct vnode *)object->handle)->
3472 				    v_writecount > 0,
3473 				    ("vmspace_fork: v_writecount %p", object));
3474 				KASSERT(object->un_pager.vnp.writemappings > 0,
3475 				    ("vmspace_fork: vnp.writecount %p",
3476 				    object));
3477 			}
3478 			VM_OBJECT_WUNLOCK(object);
3479 
3480 			/*
3481 			 * Clone the entry, referencing the shared object.
3482 			 */
3483 			new_entry = vm_map_entry_create(new_map);
3484 			*new_entry = *old_entry;
3485 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3486 			    MAP_ENTRY_IN_TRANSITION);
3487 			new_entry->wiring_thread = NULL;
3488 			new_entry->wired_count = 0;
3489 			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3490 				vnode_pager_update_writecount(object,
3491 				    new_entry->start, new_entry->end);
3492 			}
3493 
3494 			/*
3495 			 * Insert the entry into the new map -- we know we're
3496 			 * inserting at the end of the new map.
3497 			 */
3498 			vm_map_entry_link(new_map, new_map->header.prev,
3499 			    new_entry);
3500 			vmspace_map_entry_forked(vm1, vm2, new_entry);
3501 
3502 			/*
3503 			 * Update the physical map
3504 			 */
3505 			pmap_copy(new_map->pmap, old_map->pmap,
3506 			    new_entry->start,
3507 			    (old_entry->end - old_entry->start),
3508 			    old_entry->start);
3509 			break;
3510 
3511 		case VM_INHERIT_COPY:
3512 			/*
3513 			 * Clone the entry and link into the map.
3514 			 */
3515 			new_entry = vm_map_entry_create(new_map);
3516 			*new_entry = *old_entry;
3517 			/*
3518 			 * Copied entry is COW over the old object.
3519 			 */
3520 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3521 			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
3522 			new_entry->wiring_thread = NULL;
3523 			new_entry->wired_count = 0;
3524 			new_entry->object.vm_object = NULL;
3525 			new_entry->cred = NULL;
3526 			vm_map_entry_link(new_map, new_map->header.prev,
3527 			    new_entry);
3528 			vmspace_map_entry_forked(vm1, vm2, new_entry);
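			/*
			 * vm_map_copy_entry() sets up the copy-on-write
			 * sharing of the old entry's object and updates
			 * *fork_charge for any swap accounting the child
			 * entry requires.
			 */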
3529 			vm_map_copy_entry(old_map, new_map, old_entry,
3530 			    new_entry, fork_charge);
3531 			break;
3532 
3533 		case VM_INHERIT_ZERO:
3534 			/*
3535 			 * Create a new anonymous mapping entry modelled on
3536 			 * the old one.
3537 			 */
3538 			new_entry = vm_map_entry_create(new_map);
3539 			memset(new_entry, 0, sizeof(*new_entry));
3540 
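			/*
			 * Only the fields set below carry over.  In
			 * particular, no backing object is assigned, so the
			 * child sees demand-zero pages in this range.
			 */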
3541 			new_entry->start = old_entry->start;
3542 			new_entry->end = old_entry->end;
3543 			new_entry->eflags = old_entry->eflags &
3544 			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
3545 			    MAP_ENTRY_VN_WRITECNT);
3546 			new_entry->protection = old_entry->protection;
3547 			new_entry->max_protection = old_entry->max_protection;
3548 			new_entry->inheritance = VM_INHERIT_ZERO;
3549 
3550 			vm_map_entry_link(new_map, new_map->header.prev,
3551 			    new_entry);
3552 			vmspace_map_entry_forked(vm1, vm2, new_entry);
3553 
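			/*
			 * Charge the whole range to the current credential;
			 * the caller reserves swap for it based on the
			 * updated *fork_charge.
			 */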
3554 			new_entry->cred = curthread->td_ucred;
3555 			crhold(new_entry->cred);
3556 			*fork_charge += (new_entry->end - new_entry->start);
3557 
3558 			break;
3559 		}
3560 		old_entry = old_entry->next;
3561 	}
3562 	/*
3563 	 * Use inlined vm_map_unlock() to postpone handling the deferred
3564 	 * map entries, which cannot be done until both old_map and
3565 	 * new_map locks are released.
3566 	 */
3567 	sx_xunlock(&old_map->lock);
3568 	sx_xunlock(&new_map->lock);
3569 	vm_map_process_deferred();
3570 
3571 	return (vm2);
3572 }
3573 
3574 /*
3575  * Create a process's stack for exec_new_vmspace().  This function is never
3576  * asked to wire the newly created stack.
3577  */
3578 int
3579 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3580     vm_prot_t prot, vm_prot_t max, int cow)
3581 {
3582 	vm_size_t growsize, init_ssize;
3583 	rlim_t vmemlim;
3584 	int rv;
3585 
3586 	MPASS((map->flags & MAP_WIREFUTURE) == 0);
3587 	growsize = sgrowsiz;
3588 	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
3589 	vm_map_lock(map);
3590 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
3591 	/* If we would blow our VMEM resource limit, no go */
3592 	if (map->size + init_ssize > vmemlim) {
3593 		rv = KERN_NO_SPACE;
3594 		goto out;
3595 	}
3596 	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
3597 	    max, cow);
3598 out:
3599 	vm_map_unlock(map);
3600 	return (rv);
3601 }
3602 
3603 static int stack_guard_page = 1;
3604 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
3605     &stack_guard_page, 0,
3606     "Specifies the number of guard pages for a stack that grows");
3607 
3608 static int
3609 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3610     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
3611 {
3612 	vm_map_entry_t new_entry, prev_entry;
3613 	vm_offset_t bot, gap_bot, gap_top, top;
3614 	vm_size_t init_ssize, sgp;
3615 	int orient, rv;
3616 
3617 	/*
3618 	 * The stack orientation is piggybacked with the cow argument.
3619 	 * Extract it into orient.  The cow argument itself is passed
3620 	 * on unchanged to vm_map_insert() below.
3621 	 */
3622 	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
3623 	KASSERT(orient != 0, ("No stack grow direction"));
3624 	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
3625 	    ("bi-dir stack"));
3626 
3627 	if (addrbos < vm_map_min(map) ||
3628 	    addrbos + max_ssize > vm_map_max(map) ||
3629 	    addrbos + max_ssize <= addrbos)
3630 		return (KERN_INVALID_ADDRESS);
3631 	sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
3632 	if (sgp >= max_ssize)
3633 		return (KERN_INVALID_ARGUMENT);
3634 
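	/*
	 * Start with a stack of growsize bytes, clipped so that the guard
	 * gap always fits within the max_ssize reservation.
	 */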
3635 	init_ssize = growsize;
3636 	if (max_ssize < init_ssize + sgp)
3637 		init_ssize = max_ssize - sgp;
3638 
3639 	/* If addr is already mapped, no go */
3640 	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
3641 		return (KERN_NO_SPACE);
3642 
3643 	/*
3644 	 * If we can't accommodate max_ssize in the current mapping, no go.
3645 	 */
3646 	if ((prev_entry->next != &map->header) &&
3647 	    (prev_entry->next->start < addrbos + max_ssize))
3648 		return (KERN_NO_SPACE);
3649 
3650 	/*
3651 	 * We initially map a stack of only init_ssize.  We will grow as
3652 	 * needed later.  Depending on the orientation of the stack (i.e.
3653 	 * the grow direction) we map the initial pages at either the top
3654 	 * or the bottom of the range, leaving the rest as the stack gap.
3655 	 *
3656 	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
3657 	 * and cow to be 0.  Possibly we should eliminate these as input
3658 	 * parameters, and just pass these values here in the insert call.
3659 	 */
3660 	if (orient == MAP_STACK_GROWS_DOWN) {
3661 		bot = addrbos + max_ssize - init_ssize;
3662 		top = bot + init_ssize;
3663 		gap_bot = addrbos;
3664 		gap_top = bot;
3665 	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
3666 		bot = addrbos;
3667 		top = bot + init_ssize;
3668 		gap_bot = top;
3669 		gap_top = addrbos + max_ssize;
3670 	}
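	/*
	 * Two map entries are created: one for the initial stack pages and
	 * a guard/gap entry covering the rest of the reservation, which
	 * vm_map_growstack() converts into stack pages on demand.
	 */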
3671 	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
3672 	if (rv != KERN_SUCCESS)
3673 		return (rv);
3674 	new_entry = prev_entry->next;
3675 	KASSERT(new_entry->end == top || new_entry->start == bot,
3676 	    ("Bad entry start/end for new stack entry"));
3677 	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
3678 	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
3679 	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
3680 	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
3681 	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
3682 	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
3683 	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
3684 	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
3685 	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
3686 	if (rv != KERN_SUCCESS)
3687 		(void)vm_map_delete(map, bot, top);
3688 	return (rv);
3689 }
3690 
3691 /*
3692  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
3693  * successfully grow the stack.
3694  */
3695 static int
3696 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
3697 {
3698 	vm_map_entry_t stack_entry;
3699 	struct proc *p;
3700 	struct vmspace *vm;
3701 	struct ucred *cred;
3702 	vm_offset_t gap_end, gap_start, grow_start;
3703 	size_t grow_amount, guard, max_grow;
3704 	rlim_t lmemlim, stacklim, vmemlim;
3705 	int rv, rv1;
3706 	bool gap_deleted, grow_down, is_procstack;
3707 #ifdef notyet
3708 	uint64_t limit;
3709 #endif
3710 #ifdef RACCT
3711 	int error;
3712 #endif
3713 
3714 	p = curproc;
3715 	vm = p->p_vmspace;
3716 
3717 	/*
3718 	 * Disallow stack growth when the access is performed by a
3719 	 * debugger or AIO daemon.  The reason is that the wrong
3720 	 * resource limits are applied.
3721 	 */
3722 	if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
3723 		return (KERN_FAILURE);
3724 
3725 	MPASS(!map->system_map);
3726 
3727 	guard = stack_guard_page * PAGE_SIZE;
3728 	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
3729 	stacklim = lim_cur(curthread, RLIMIT_STACK);
3730 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
3731 retry:
3732 	/* If addr is not in a hole for a stack grow area, no need to grow. */
3733 	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
3734 		return (KERN_FAILURE);
3735 	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
3736 		return (KERN_SUCCESS);
3737 	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
3738 		stack_entry = gap_entry->next;
3739 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
3740 		    stack_entry->start != gap_entry->end)
3741 			return (KERN_FAILURE);
3742 		grow_amount = round_page(stack_entry->start - addr);
3743 		grow_down = true;
3744 	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
3745 		stack_entry = gap_entry->prev;
3746 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
3747 		    stack_entry->end != gap_entry->start)
3748 			return (KERN_FAILURE);
3749 		grow_amount = round_page(addr + 1 - stack_entry->end);
3750 		grow_down = false;
3751 	} else {
3752 		return (KERN_FAILURE);
3753 	}
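	/*
	 * Growth is limited to the remaining gap, minus the guard pages at
	 * its far end, which are never mapped.
	 */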
3754 	max_grow = gap_entry->end - gap_entry->start;
3755 	if (guard > max_grow)
3756 		return (KERN_NO_SPACE);
3757 	max_grow -= guard;
3758 	if (grow_amount > max_grow)
3759 		return (KERN_NO_SPACE);
3760 
3761 	/*
3762 	 * If this is the main process stack, see if we're over the stack
3763 	 * limit.
3764 	 */
3765 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
3766 	    addr < (vm_offset_t)p->p_sysent->sv_usrstack;
3767 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
3768 		return (KERN_NO_SPACE);
3769 
3770 #ifdef RACCT
3771 	if (racct_enable) {
3772 		PROC_LOCK(p);
3773 		if (is_procstack && racct_set(p, RACCT_STACK,
3774 		    ctob(vm->vm_ssize) + grow_amount)) {
3775 			PROC_UNLOCK(p);
3776 			return (KERN_NO_SPACE);
3777 		}
3778 		PROC_UNLOCK(p);
3779 	}
3780 #endif
3781 
3782 	grow_amount = roundup(grow_amount, sgrowsiz);
3783 	if (grow_amount > max_grow)
3784 		grow_amount = max_grow;
3785 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
3786 		grow_amount = trunc_page((vm_size_t)stacklim) -
3787 		    ctob(vm->vm_ssize);
3788 	}
3789 
3790 #ifdef notyet
3791 	PROC_LOCK(p);
3792 	limit = racct_get_available(p, RACCT_STACK);
3793 	PROC_UNLOCK(p);
3794 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
3795 		grow_amount = limit - ctob(vm->vm_ssize);
3796 #endif
3797 
3798 	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
3799 		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
3800 			rv = KERN_NO_SPACE;
3801 			goto out;
3802 		}
3803 #ifdef RACCT
3804 		if (racct_enable) {
3805 			PROC_LOCK(p);
3806 			if (racct_set(p, RACCT_MEMLOCK,
3807 			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
3808 				PROC_UNLOCK(p);
3809 				rv = KERN_NO_SPACE;
3810 				goto out;
3811 			}
3812 			PROC_UNLOCK(p);
3813 		}
3814 #endif
3815 	}
3816 
3817 	/* If we would blow our VMEM resource limit, no go */
3818 	if (map->size + grow_amount > vmemlim) {
3819 		rv = KERN_NO_SPACE;
3820 		goto out;
3821 	}
3822 #ifdef RACCT
3823 	if (racct_enable) {
3824 		PROC_LOCK(p);
3825 		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
3826 			PROC_UNLOCK(p);
3827 			rv = KERN_NO_SPACE;
3828 			goto out;
3829 		}
3830 		PROC_UNLOCK(p);
3831 	}
3832 #endif
3833 
3834 	if (vm_map_lock_upgrade(map)) {
3835 		gap_entry = NULL;
3836 		vm_map_lock_read(map);
3837 		goto retry;
3838 	}
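	/*
	 * The map is now exclusively locked.  If the upgrade above had to
	 * drop the lock, gap_entry was reset and the checks were redone
	 * from "retry" under a fresh lookup before reaching this point.
	 */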
3839 
3840 	if (grow_down) {
3841 		grow_start = gap_entry->end - grow_amount;
3842 		if (gap_entry->start + grow_amount == gap_entry->end) {
3843 			gap_start = gap_entry->start;
3844 			gap_end = gap_entry->end;
3845 			vm_map_entry_delete(map, gap_entry);
3846 			gap_deleted = true;
3847 		} else {
3848 			MPASS(gap_entry->start < gap_entry->end - grow_amount);
3849 			gap_entry->end -= grow_amount;
3850 			vm_map_entry_resize_free(map, gap_entry);
3851 			gap_deleted = false;
3852 		}
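		/*
		 * Map the reclaimed part of the gap with the stack entry's
		 * protection; if the insertion fails, the gap entry is
		 * restored below.
		 */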
3853 		rv = vm_map_insert(map, NULL, 0, grow_start,
3854 		    grow_start + grow_amount,
3855 		    stack_entry->protection, stack_entry->max_protection,
3856 		    MAP_STACK_GROWS_DOWN);
3857 		if (rv != KERN_SUCCESS) {
3858 			if (gap_deleted) {
3859 				rv1 = vm_map_insert(map, NULL, 0, gap_start,
3860 				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
3861 				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
3862 				MPASS(rv1 == KERN_SUCCESS);
3863 			} else {
3864 				gap_entry->end += grow_amount;
3865 				vm_map_entry_resize_free(map, gap_entry);
3866 			}
3867 		}
3868 	} else {
3869 		grow_start = stack_entry->end;
3870 		cred = stack_entry->cred;
3871 		if (cred == NULL && stack_entry->object.vm_object != NULL)
3872 			cred = stack_entry->object.vm_object->cred;
3873 		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
3874 			rv = KERN_NO_SPACE;
3875 		/* Grow the underlying object if applicable. */
3876 		else if (stack_entry->object.vm_object == NULL ||
3877 		    vm_object_coalesce(stack_entry->object.vm_object,
3878 		    stack_entry->offset,
3879 		    (vm_size_t)(stack_entry->end - stack_entry->start),
3880 		    (vm_size_t)grow_amount, cred != NULL)) {
3881 			if (gap_entry->start + grow_amount == gap_entry->end)
3882 				vm_map_entry_delete(map, gap_entry);
3883 			else
3884 				gap_entry->start += grow_amount;
3885 			stack_entry->end += grow_amount;
3886 			map->size += grow_amount;
3887 			vm_map_entry_resize_free(map, stack_entry);
3888 			rv = KERN_SUCCESS;
3889 		} else
3890 			rv = KERN_FAILURE;
3891 	}
3892 	if (rv == KERN_SUCCESS && is_procstack)
3893 		vm->vm_ssize += btoc(grow_amount);
3894 
3895 	/*
3896 	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
3897 	 */
3898 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
3899 		vm_map_unlock(map);
3900 		vm_map_wire(map, grow_start, grow_start + grow_amount,
3901 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
3902 		vm_map_lock_read(map);
3903 	} else
3904 		vm_map_lock_downgrade(map);
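	/* Both branches leave the map read-locked for the caller. */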
3905 
3906 out:
3907 #ifdef RACCT
3908 	if (racct_enable && rv != KERN_SUCCESS) {
3909 		PROC_LOCK(p);
3910 		error = racct_set(p, RACCT_VMEM, map->size);
3911 		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
3912 		if (!old_mlock) {
3913 			error = racct_set(p, RACCT_MEMLOCK,
3914 			    ptoa(pmap_wired_count(map->pmap)));
3915 			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
3916 		}
3917 		error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
3918 		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
3919 		PROC_UNLOCK(p);
3920 	}
3921 #endif
3922 
3923 	return (rv);
3924 }
3925 
3926 /*
3927  * Unshare the specified VM space for exec.  If other processes
3928  * share it, create a new one.  The new vmspace starts out empty.
3929  */
3930 int
3931 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
3932 {
3933 	struct vmspace *oldvmspace = p->p_vmspace;
3934 	struct vmspace *newvmspace;
3935 
3936 	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
3937 	    ("vmspace_exec recursed"));
3938 	newvmspace = vmspace_alloc(minuser, maxuser, NULL);
3939 	if (newvmspace == NULL)
3940 		return (ENOMEM);
3941 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
3942 	/*
3943 	 * This code is written like this for prototype purposes.  The
3944  * goal is to avoid running down the vmspace here, but to let the
3945  * other processes that are still using the vmspace finally run
3946  * it down.  Even though there is little or no chance of blocking
3947 	 * here, it is a good idea to keep this form for future mods.
3948 	 */
3949 	PROC_VMSPACE_LOCK(p);
3950 	p->p_vmspace = newvmspace;
3951 	PROC_VMSPACE_UNLOCK(p);
3952 	if (p == curthread->td_proc)
3953 		pmap_activate(curthread);
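	/*
	 * The old vmspace is not released here; TDP_EXECVMSPC tells the
	 * exec code to free it once the switch to the new vmspace is
	 * complete.
	 */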
3954 	curthread->td_pflags |= TDP_EXECVMSPC;
3955 	return (0);
3956 }
3957 
3958 /*
3959  * Unshare the specified VM space for forcing COW.  This
3960  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3961  */
3962 int
3963 vmspace_unshare(struct proc *p)
3964 {
3965 	struct vmspace *oldvmspace = p->p_vmspace;
3966 	struct vmspace *newvmspace;
3967 	vm_ooffset_t fork_charge;
3968 
3969 	if (oldvmspace->vm_refcnt == 1)
3970 		return (0);
3971 	fork_charge = 0;
3972 	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
3973 	if (newvmspace == NULL)
3974 		return (ENOMEM);
3975 	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
3976 		vmspace_free(newvmspace);
3977 		return (ENOMEM);
3978 	}
3979 	PROC_VMSPACE_LOCK(p);
3980 	p->p_vmspace = newvmspace;
3981 	PROC_VMSPACE_UNLOCK(p);
3982 	if (p == curthread->td_proc)
3983 		pmap_activate(curthread);
3984 	vmspace_free(oldvmspace);
3985 	return (0);
3986 }
3987 
3988 /*
3989  *	vm_map_lookup:
3990  *
3991  *	Finds the VM object, offset, and
3992  *	protection for a given virtual address in the
3993  *	specified map, assuming a page fault of the
3994  *	type specified.
3995  *
3996  *	Leaves the map in question locked for read; return
3997  *	values are guaranteed until a vm_map_lookup_done
3998  *	call is performed.  Note that the map argument
3999  *	is in/out; the returned map must be used in
4000  *	the call to vm_map_lookup_done.
4001  *
4002  *	A handle (out_entry) is returned for use in
4003  *	vm_map_lookup_done, to make that fast.
4004  *
4005  *	If a lookup is requested with "write protection"
4006  *	specified, the map may be changed to perform virtual
4007  *	copying operations, although the data referenced will
4008  *	remain the same.
4009  */
4010 int
4011 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
4012 	      vm_offset_t vaddr,
4013 	      vm_prot_t fault_typea,
4014 	      vm_map_entry_t *out_entry,	/* OUT */
4015 	      vm_object_t *object,		/* OUT */
4016 	      vm_pindex_t *pindex,		/* OUT */
4017 	      vm_prot_t *out_prot,		/* OUT */
4018 	      boolean_t *wired)			/* OUT */
4019 {
4020 	vm_map_entry_t entry;
4021 	vm_map_t map = *var_map;
4022 	vm_prot_t prot;
4023 	vm_prot_t fault_type = fault_typea;
4024 	vm_object_t eobject;
4025 	vm_size_t size;
4026 	struct ucred *cred;
4027 
4028 RetryLookup:
4029 
4030 	vm_map_lock_read(map);
4031 
4032 RetryLookupLocked:
4033 	/*
4034 	 * Lookup the faulting address.
4035 	 */
4036 	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
4037 		vm_map_unlock_read(map);
4038 		return (KERN_INVALID_ADDRESS);
4039 	}
4040 
4041 	entry = *out_entry;
4042 
4043 	/*
4044 	 * Handle submaps.
4045 	 */
4046 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4047 		vm_map_t old_map = map;
4048 
4049 		*var_map = map = entry->object.sub_map;
4050 		vm_map_unlock_read(old_map);
4051 		goto RetryLookup;
4052 	}
4053 
4054 	/*
4055 	 * Check whether this task is allowed to have this page.
4056 	 */
4057 	prot = entry->protection;
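	/*
	 * If the fault hit a guard entry marking a stack gap, try to grow
	 * the stack into it; on success the gap has been replaced by stack
	 * pages and the lookup is redone while still holding the read lock.
	 */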
4058 	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
4059 		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
4060 		if (prot == VM_PROT_NONE && map != kernel_map &&
4061 		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4062 		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4063 		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
4064 		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
4065 			goto RetryLookupLocked;
4066 	}
4067 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4068 	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4069 		vm_map_unlock_read(map);
4070 		return (KERN_PROTECTION_FAILURE);
4071 	}
4072 	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
4073 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
4074 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
4075 	    ("entry %p flags %x", entry, entry->eflags));
4076 	if ((fault_typea & VM_PROT_COPY) != 0 &&
4077 	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
4078 	    (entry->eflags & MAP_ENTRY_COW) == 0) {
4079 		vm_map_unlock_read(map);
4080 		return (KERN_PROTECTION_FAILURE);
4081 	}
4082 
4083 	/*
4084 	 * If this page is not pageable, we have to get it for all possible
4085 	 * accesses.
4086 	 */
4087 	*wired = (entry->wired_count != 0);
4088 	if (*wired)
4089 		fault_type = entry->protection;
4090 	size = entry->end - entry->start;
4091 	/*
4092 	 * If the entry was copy-on-write, handle it as follows.
4093 	 */
4094 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4095 		/*
4096 		 * If we want to write the page, we may as well handle that
4097 		 * now since we've got the map locked.
4098 		 *
4099 		 * If we don't need to write the page, we just demote the
4100 		 * permissions allowed.
4101 		 */
4102 		if ((fault_type & VM_PROT_WRITE) != 0 ||
4103 		    (fault_typea & VM_PROT_COPY) != 0) {
4104 			/*
4105 			 * Make a new object, and place it in the object
4106 			 * chain.  Note that no new references have appeared
4107 			 * -- one just moved from the map to the new
4108 			 * object.
4109 			 */
4110 			if (vm_map_lock_upgrade(map))
4111 				goto RetryLookup;
4112 
4113 			if (entry->cred == NULL) {
4114 				/*
4115 				 * The debugger owner is charged for
4116 				 * the memory.
4117 				 */
4118 				cred = curthread->td_ucred;
4119 				crhold(cred);
4120 				if (!swap_reserve_by_cred(size, cred)) {
4121 					crfree(cred);
4122 					vm_map_unlock(map);
4123 					return (KERN_RESOURCE_SHORTAGE);
4124 				}
4125 				entry->cred = cred;
4126 			}
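			/*
			 * Shadow the object; the charge carried by the
			 * entry is transferred to the new shadow object
			 * below, or released if no shadow was created.
			 */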
4127 			vm_object_shadow(&entry->object.vm_object,
4128 			    &entry->offset, size);
4129 			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4130 			eobject = entry->object.vm_object;
4131 			if (eobject->cred != NULL) {
4132 				/*
4133 				 * The object was not shadowed.
4134 				 */
4135 				swap_release_by_cred(size, entry->cred);
4136 				crfree(entry->cred);
4137 				entry->cred = NULL;
4138 			} else if (entry->cred != NULL) {
4139 				VM_OBJECT_WLOCK(eobject);
4140 				eobject->cred = entry->cred;
4141 				eobject->charge = size;
4142 				VM_OBJECT_WUNLOCK(eobject);
4143 				entry->cred = NULL;
4144 			}
4145 
4146 			vm_map_lock_downgrade(map);
4147 		} else {
4148 			/*
4149 			 * We're attempting to read a copy-on-write page --
4150 			 * don't allow writes.
4151 			 */
4152 			prot &= ~VM_PROT_WRITE;
4153 		}
4154 	}
4155 
4156 	/*
4157 	 * Create an object if necessary.
4158 	 */
4159 	if (entry->object.vm_object == NULL &&
4160 	    !map->system_map) {
4161 		if (vm_map_lock_upgrade(map))
4162 			goto RetryLookup;
4163 		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
4164 		    atop(size));
4165 		entry->offset = 0;
4166 		if (entry->cred != NULL) {
4167 			VM_OBJECT_WLOCK(entry->object.vm_object);
4168 			entry->object.vm_object->cred = entry->cred;
4169 			entry->object.vm_object->charge = size;
4170 			VM_OBJECT_WUNLOCK(entry->object.vm_object);
4171 			entry->cred = NULL;
4172 		}
4173 		vm_map_lock_downgrade(map);
4174 	}
4175 
4176 	/*
4177 	 * Return the object/offset from this entry.  If the entry was
4178 	 * copy-on-write or empty, it has been fixed up.
4179 	 */
4180 	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
4181 	*object = entry->object.vm_object;
4182 
4183 	*out_prot = prot;
4184 	return (KERN_SUCCESS);
4185 }
4186 
4187 /*
4188  *	vm_map_lookup_locked:
4189  *
4190  *	Lookup the faulting address.  A version of vm_map_lookup that returns
4191  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
4192  */
4193 int
4194 vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
4195 		     vm_offset_t vaddr,
4196 		     vm_prot_t fault_typea,
4197 		     vm_map_entry_t *out_entry,	/* OUT */
4198 		     vm_object_t *object,	/* OUT */
4199 		     vm_pindex_t *pindex,	/* OUT */
4200 		     vm_prot_t *out_prot,	/* OUT */
4201 		     boolean_t *wired)		/* OUT */
4202 {
4203 	vm_map_entry_t entry;
4204 	vm_map_t map = *var_map;
4205 	vm_prot_t prot;
4206 	vm_prot_t fault_type = fault_typea;
4207 
4208 	/*
4209 	 * Lookup the faulting address.
4210 	 */
4211 	if (!vm_map_lookup_entry(map, vaddr, out_entry))
4212 		return (KERN_INVALID_ADDRESS);
4213 
4214 	entry = *out_entry;
4215 
4216 	/*
4217 	 * Fail if the entry refers to a submap.
4218 	 */
4219 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
4220 		return (KERN_FAILURE);
4221 
4222 	/*
4223 	 * Check whether this task is allowed to have this page.
4224 	 */
4225 	prot = entry->protection;
4226 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4227 	if ((fault_type & prot) != fault_type)
4228 		return (KERN_PROTECTION_FAILURE);
4229 
4230 	/*
4231 	 * If this page is not pageable, we have to get it for all possible
4232 	 * accesses.
4233 	 */
4234 	*wired = (entry->wired_count != 0);
4235 	if (*wired)
4236 		fault_type = entry->protection;
4237 
4238 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4239 		/*
4240 		 * Fail if the entry was copy-on-write for a write fault.
4241 		 */
4242 		if (fault_type & VM_PROT_WRITE)
4243 			return (KERN_FAILURE);
4244 		/*
4245 		 * We're attempting to read a copy-on-write page --
4246 		 * don't allow writes.
4247 		 */
4248 		prot &= ~VM_PROT_WRITE;
4249 	}
4250 
4251 	/*
4252 	 * Fail if an object should be created.
4253 	 */
4254 	if (entry->object.vm_object == NULL && !map->system_map)
4255 		return (KERN_FAILURE);
4256 
4257 	/*
4258 	 * Return the object/offset from this entry.  If the entry was
4259 	 * copy-on-write or empty, it has been fixed up.
4260 	 */
4261 	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
4262 	*object = entry->object.vm_object;
4263 
4264 	*out_prot = prot;
4265 	return (KERN_SUCCESS);
4266 }
4267 
4268 /*
4269  *	vm_map_lookup_done:
4270  *
4271  *	Releases locks acquired by a vm_map_lookup
4272  *	(according to the handle returned by that lookup).
4273  */
4274 void
4275 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
4276 {
4277 	/*
4278 	 * Unlock the main-level map
4279 	 */
4280 	vm_map_unlock_read(map);
4281 }
4282 
4283 #include "opt_ddb.h"
4284 #ifdef DDB
4285 #include <sys/kernel.h>
4286 
4287 #include <ddb/ddb.h>
4288 
4289 static void
4290 vm_map_print(vm_map_t map)
4291 {
4292 	vm_map_entry_t entry;
4293 
4294 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4295 	    (void *)map,
4296 	    (void *)map->pmap, map->nentries, map->timestamp);
4297 
4298 	db_indent += 2;
4299 	for (entry = map->header.next; entry != &map->header;
4300 	    entry = entry->next) {
4301 		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x\n",
4302 		    (void *)entry, (void *)entry->start, (void *)entry->end,
4303 		    entry->eflags);
4304 		{
4305 			static char *inheritance_name[4] =
4306 			{"share", "copy", "none", "donate_copy"};
4307 
4308 			db_iprintf(" prot=%x/%x/%s",
4309 			    entry->protection,
4310 			    entry->max_protection,
4311 			    inheritance_name[(int)(unsigned char)entry->inheritance]);
4312 			if (entry->wired_count != 0)
4313 				db_printf(", wired");
4314 		}
4315 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4316 			db_printf(", share=%p, offset=0x%jx\n",
4317 			    (void *)entry->object.sub_map,
4318 			    (uintmax_t)entry->offset);
4319 			if ((entry->prev == &map->header) ||
4320 			    (entry->prev->object.sub_map !=
4321 				entry->object.sub_map)) {
4322 				db_indent += 2;
4323 				vm_map_print((vm_map_t)entry->object.sub_map);
4324 				db_indent -= 2;
4325 			}
4326 		} else {
4327 			if (entry->cred != NULL)
4328 				db_printf(", ruid %d", entry->cred->cr_ruid);
4329 			db_printf(", object=%p, offset=0x%jx",
4330 			    (void *)entry->object.vm_object,
4331 			    (uintmax_t)entry->offset);
4332 			if (entry->object.vm_object && entry->object.vm_object->cred)
4333 				db_printf(", obj ruid %d charge %jx",
4334 				    entry->object.vm_object->cred->cr_ruid,
4335 				    (uintmax_t)entry->object.vm_object->charge);
4336 			if (entry->eflags & MAP_ENTRY_COW)
4337 				db_printf(", copy (%s)",
4338 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4339 			db_printf("\n");
4340 
4341 			if ((entry->prev == &map->header) ||
4342 			    (entry->prev->object.vm_object !=
4343 				entry->object.vm_object)) {
4344 				db_indent += 2;
4345 				vm_object_print((db_expr_t)(intptr_t)
4346 						entry->object.vm_object,
4347 						0, 0, (char *)0);
4348 				db_indent -= 2;
4349 			}
4350 		}
4351 	}
4352 	db_indent -= 2;
4353 }
4354 
4355 DB_SHOW_COMMAND(map, map)
4356 {
4357 
4358 	if (!have_addr) {
4359 		db_printf("usage: show map <addr>\n");
4360 		return;
4361 	}
4362 	vm_map_print((vm_map_t)addr);
4363 }
4364 
4365 DB_SHOW_COMMAND(procvm, procvm)
4366 {
4367 	struct proc *p;
4368 
4369 	if (have_addr) {
4370 		p = db_lookup_proc(addr);
4371 	} else {
4372 		p = curproc;
4373 	}
4374 
4375 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4376 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4377 	    (void *)vmspace_pmap(p->p_vmspace));
4378 
4379 	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
4380 }
4381 
4382 #endif /* DDB */
4383