xref: /freebsd/sys/vm/vm_map.c (revision 731d06abf2105cc0873fa84e972178f9f37ca760)
1 /*-
2  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3  *
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * The Mach Operating System project at Carnegie-Mellon University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
35  *
36  *
37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38  * All rights reserved.
39  *
40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41  *
42  * Permission to use, copy, modify and distribute this software and
43  * its documentation is hereby granted, provided that both the copyright
44  * notice and this permission notice appear in all copies of the
45  * software, derivative works or modified versions, and any portions
46  * thereof, and that both notices appear in supporting documentation.
47  *
48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51  *
52  * Carnegie Mellon requests users of this software to return to
53  *
54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55  *  School of Computer Science
56  *  Carnegie Mellon University
57  *  Pittsburgh PA 15213-3890
58  *
59  * any improvements or extensions that they make and grant Carnegie the
60  * rights to redistribute these changes.
61  */
62 
63 /*
64  *	Virtual memory mapping module.
65  */
66 
67 #include <sys/cdefs.h>
68 __FBSDID("$FreeBSD$");
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/lock.h>
75 #include <sys/mutex.h>
76 #include <sys/proc.h>
77 #include <sys/vmmeter.h>
78 #include <sys/mman.h>
79 #include <sys/vnode.h>
80 #include <sys/racct.h>
81 #include <sys/resourcevar.h>
82 #include <sys/rwlock.h>
83 #include <sys/file.h>
84 #include <sys/sysctl.h>
85 #include <sys/sysent.h>
86 #include <sys/shm.h>
87 
88 #include <vm/vm.h>
89 #include <vm/vm_param.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_pager.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_extern.h>
97 #include <vm/vnode_pager.h>
98 #include <vm/swap_pager.h>
99 #include <vm/uma.h>
100 
101 /*
102  *	Virtual memory maps provide for the mapping, protection,
103  *	and sharing of virtual memory objects.  In addition,
104  *	this module provides for an efficient virtual copy of
105  *	memory from one map to another.
106  *
107  *	Synchronization is required prior to most operations.
108  *
109  *	Maps consist of an ordered doubly-linked list of simple
110  *	entries; a self-adjusting binary search tree of these
111  *	entries is used to speed up lookups.
112  *
113  *	Since portions of maps are specified by start/end addresses,
114  *	which may not align with existing map entries, all
115  *	routines merely "clip" entries to these start/end values.
116  *	[That is, an entry is split into two, bordering at a
117  *	start or end value.]  Note that these clippings may not
118  *	always be necessary (as the two resulting entries are then
119  *	not changed); however, the clipping is done for convenience.
120  *
121  *	As mentioned above, virtual copy operations are performed
122  *	by copying VM object references from one map to
123  *	another, and then marking both regions as copy-on-write.
124  */
125 
126 static struct mtx map_sleep_mtx;
127 static uma_zone_t mapentzone;
128 static uma_zone_t kmapentzone;
129 static uma_zone_t mapzone;
130 static uma_zone_t vmspace_zone;
131 static int vmspace_zinit(void *mem, int size, int flags);
132 static int vm_map_zinit(void *mem, int ize, int flags);
133 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
134     vm_offset_t max);
135 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
136 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
137 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
138 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
139     vm_map_entry_t gap_entry);
140 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
141     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
142 #ifdef INVARIANTS
143 static void vm_map_zdtor(void *mem, int size, void *arg);
144 static void vmspace_zdtor(void *mem, int size, void *arg);
145 #endif
146 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
147     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
148     int cow);
149 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
150     vm_offset_t failed_addr);
151 
/*
 * Evaluates to true when the entry itself holds a credential, or when its
 * backing VM object exists, holds a credential, and the entry is not flagged
 * MAP_ENTRY_NEEDS_COPY.  The argument is evaluated several times.
 */
#define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
    ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
     !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
155 
156 /*
157  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
158  * stable.
159  */
160 #define PROC_VMSPACE_LOCK(p) do { } while (0)
161 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
162 
/*
 *	VM_MAP_RANGE_CHECK:	[ internal use only ]
 *
 *	Clamps the starting and ending region addresses to the valid
 *	range of the map.  Nothing is asserted: "start" is raised to
 *	vm_map_min(map) if it lies below it, "end" is lowered to
 *	vm_map_max(map) if it lies above it, and finally "start" is
 *	lowered to "end" if it still exceeds it.
 *
 *	"start" and "end" are assigned to, so both must be modifiable
 *	lvalues.  The arguments are not parenthesized and are evaluated
 *	more than once, so they must not have side effects.  The macro
 *	expands to a bare compound statement.
 */
#define	VM_MAP_RANGE_CHECK(map, start, end)		\
		{					\
		if (start < vm_map_min(map))		\
			start = vm_map_min(map);	\
		if (end > vm_map_max(map))		\
			end = vm_map_max(map);		\
		if (start > end)			\
			start = end;			\
		}
178 
179 /*
180  *	vm_map_startup:
181  *
182  *	Initialize the vm_map module.  Must be called before
183  *	any other vm_map routines.
184  *
185  *	Map and entry structures are allocated from the general
186  *	purpose memory pool with some exceptions:
187  *
188  *	- The kernel map and kmem submap are allocated statically.
189  *	- Kernel map entries are allocated out of a static pool.
190  *
191  *	These restrictions are necessary since malloc() uses the
192  *	maps and requires map entries.
193  */
194 
void
vm_map_startup(void)
{
	/* Interlock for _vm_map_unlock_and_wait() and vm_map_wakeup(). */
	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
	/*
	 * Zone for struct vm_map.  The destructor is installed only in
	 * INVARIANTS kernels; vm_map_zinit() is the zone's init routine.
	 */
	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
#ifdef INVARIANTS
	    vm_map_zdtor,
#else
	    NULL,
#endif
	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_prealloc(mapzone, MAX_KMAP);
	/* Zone used for entries of system maps (see vm_map_entry_create()). */
	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
	/* Zone used for entries of all other maps. */
	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Zone for struct vmspace.  As with mapzone, the destructor is
	 * installed only in INVARIANTS kernels.
	 */
	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
#ifdef INVARIANTS
	    vmspace_zdtor,
#else
	    NULL,
#endif
	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
220 
221 static int
222 vmspace_zinit(void *mem, int size, int flags)
223 {
224 	struct vmspace *vm;
225 
226 	vm = (struct vmspace *)mem;
227 
228 	vm->vm_map.pmap = NULL;
229 	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
230 	PMAP_LOCK_INIT(vmspace_pmap(vm));
231 	return (0);
232 }
233 
234 static int
235 vm_map_zinit(void *mem, int size, int flags)
236 {
237 	vm_map_t map;
238 
239 	map = (vm_map_t)mem;
240 	memset(map, 0, sizeof(*map));
241 	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
242 	sx_init(&map->lock, "vm map (user)");
243 	return (0);
244 }
245 
246 #ifdef INVARIANTS
247 static void
248 vmspace_zdtor(void *mem, int size, void *arg)
249 {
250 	struct vmspace *vm;
251 
252 	vm = (struct vmspace *)mem;
253 
254 	vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
255 }
256 static void
257 vm_map_zdtor(void *mem, int size, void *arg)
258 {
259 	vm_map_t map;
260 
261 	map = (vm_map_t)mem;
262 	KASSERT(map->nentries == 0,
263 	    ("map %p nentries == %d on free.",
264 	    map, map->nentries));
265 	KASSERT(map->size == 0,
266 	    ("map %p size == %lu on free.",
267 	    map, (unsigned long)map->size));
268 }
269 #endif	/* INVARIANTS */
270 
271 /*
272  * Allocate a vmspace structure, including a vm_map and pmap,
273  * and initialize those structures.  The refcnt is set to 1.
274  *
275  * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
276  */
277 struct vmspace *
278 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
279 {
280 	struct vmspace *vm;
281 
282 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
283 	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
284 	if (!pinit(vmspace_pmap(vm))) {
285 		uma_zfree(vmspace_zone, vm);
286 		return (NULL);
287 	}
288 	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
289 	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
290 	vm->vm_refcnt = 1;
291 	vm->vm_shm = NULL;
292 	vm->vm_swrss = 0;
293 	vm->vm_tsize = 0;
294 	vm->vm_dsize = 0;
295 	vm->vm_ssize = 0;
296 	vm->vm_taddr = 0;
297 	vm->vm_daddr = 0;
298 	vm->vm_maxsaddr = 0;
299 	return (vm);
300 }
301 
302 #ifdef RACCT
303 static void
304 vmspace_container_reset(struct proc *p)
305 {
306 
307 	PROC_LOCK(p);
308 	racct_set(p, RACCT_DATA, 0);
309 	racct_set(p, RACCT_STACK, 0);
310 	racct_set(p, RACCT_RSS, 0);
311 	racct_set(p, RACCT_MEMLOCK, 0);
312 	racct_set(p, RACCT_VMEM, 0);
313 	PROC_UNLOCK(p);
314 }
315 #endif
316 
317 static inline void
318 vmspace_dofree(struct vmspace *vm)
319 {
320 
321 	CTR1(KTR_VM, "vmspace_free: %p", vm);
322 
323 	/*
324 	 * Make sure any SysV shm is freed, it might not have been in
325 	 * exit1().
326 	 */
327 	shmexit(vm);
328 
329 	/*
330 	 * Lock the map, to wait out all other references to it.
331 	 * Delete all of the mappings and pages they hold, then call
332 	 * the pmap module to reclaim anything left.
333 	 */
334 	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
335 	    vm_map_max(&vm->vm_map));
336 
337 	pmap_release(vmspace_pmap(vm));
338 	vm->vm_map.pmap = NULL;
339 	uma_zfree(vmspace_zone, vm);
340 }
341 
342 void
343 vmspace_free(struct vmspace *vm)
344 {
345 
346 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
347 	    "vmspace_free() called");
348 
349 	if (vm->vm_refcnt == 0)
350 		panic("vmspace_free: attempt to free already freed vmspace");
351 
352 	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
353 		vmspace_dofree(vm);
354 }
355 
356 void
357 vmspace_exitfree(struct proc *p)
358 {
359 	struct vmspace *vm;
360 
361 	PROC_VMSPACE_LOCK(p);
362 	vm = p->p_vmspace;
363 	p->p_vmspace = NULL;
364 	PROC_VMSPACE_UNLOCK(p);
365 	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
366 	vmspace_free(vm);
367 }
368 
/*
 * Release the exiting thread's process from its vmspace, leaving the
 * process attached to vmspace0.  The thread that drops the last reference
 * removes the pmap's pages and frees the vmspace.
 */
void
vmspace_exit(struct thread *td)
{
	int refcnt;
	struct vmspace *vm;
	struct proc *p;

	/*
	 * Release user portion of address space.
	 * This releases references to vnodes,
	 * which could cause I/O if the file has been unlinked.
	 * Need to do this early enough that we can still sleep.
	 *
	 * The last exiting process to reach this point releases as
	 * much of the environment as it can. vmspace_dofree() is the
	 * slower fallback in case another process had a temporary
	 * reference to the vmspace.
	 */

	p = td->td_proc;
	vm = p->p_vmspace;
	/* Reference vmspace0 up front; the process is switched to it below. */
	atomic_add_int(&vmspace0.vm_refcnt, 1);
	refcnt = vm->vm_refcnt;
	/*
	 * Drop our reference with a compare-and-set loop.  When the loop
	 * exits, refcnt holds the count observed just before our decrement.
	 */
	do {
		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
			/* Switch now since other proc might free vmspace */
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = &vmspace0;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
	} while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1));
	/* refcnt == 1 means this thread just dropped the last reference. */
	if (refcnt == 1) {
		if (p->p_vmspace != vm) {
			/* vmspace not yet freed, switch back */
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = vm;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
		pmap_remove_pages(vmspace_pmap(vm));
		/* Switch now since this proc will free vmspace */
		PROC_VMSPACE_LOCK(p);
		p->p_vmspace = &vmspace0;
		PROC_VMSPACE_UNLOCK(p);
		pmap_activate(td);
		vmspace_dofree(vm);
	}
#ifdef RACCT
	/* Zero the process's memory resource accounting. */
	if (racct_enable)
		vmspace_container_reset(p);
#endif
}
422 
423 /* Acquire reference to vmspace owned by another process. */
424 
/*
 * Returns the vmspace of process 'p' with an additional reference held,
 * or NULL if the process has no vmspace, the vmspace's reference count
 * is not positive, or the process switched to a different vmspace while
 * the reference was being taken.
 */
struct vmspace *
vmspace_acquire_ref(struct proc *p)
{
	struct vmspace *vm;
	int refcnt;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	if (vm == NULL) {
		PROC_VMSPACE_UNLOCK(p);
		return (NULL);
	}
	refcnt = vm->vm_refcnt;
	/* Increment the count with compare-and-set, but never from 0. */
	do {
		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
			PROC_VMSPACE_UNLOCK(p);
			return (NULL);
		}
	} while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1));
	/*
	 * Re-check the process's vmspace pointer: if it changed in the
	 * meantime, give the reference back and report failure.
	 */
	if (vm != p->p_vmspace) {
		PROC_VMSPACE_UNLOCK(p);
		vmspace_free(vm);
		return (NULL);
	}
	PROC_VMSPACE_UNLOCK(p);
	return (vm);
}
452 
453 /*
454  * Switch between vmspaces in an AIO kernel process.
455  *
456  * The AIO kernel processes switch to and from a user process's
457  * vmspace while performing an I/O operation on behalf of a user
458  * process.  The new vmspace is either the vmspace of a user process
459  * obtained from an active AIO request or the initial vmspace of the
460  * AIO kernel process (when it is idling).  Because user processes
461  * will block to drain any active AIO requests before proceeding in
462  * exit() or execve(), the vmspace reference count for these vmspaces
463  * can never be 0.  This allows for a much simpler implementation than
464  * the loop in vmspace_acquire_ref() above.  Similarly, AIO kernel
465  * processes hold an extra reference on their initial vmspace for the
466  * life of the process so that this guarantee is true for any vmspace
467  * passed as 'newvm'.
468  */
void
vmspace_switch_aio(struct vmspace *newvm)
{
	struct vmspace *oldvm;

	/* XXX: Need some way to assert that this is an aio daemon. */

	KASSERT(newvm->vm_refcnt > 0,
	    ("vmspace_switch_aio: newvm unreferenced"));

	/* Nothing to do when already running on the requested vmspace. */
	oldvm = curproc->p_vmspace;
	if (oldvm == newvm)
		return;

	/*
	 * Point to the new address space and refer to it.
	 */
	curproc->p_vmspace = newvm;
	atomic_add_int(&newvm->vm_refcnt, 1);

	/* Activate the new mapping. */
	pmap_activate(curthread);

	/*
	 * Remove the daemon's reference to the old address space.  The
	 * assertion checks that this is not the last reference, so the
	 * vmspace_free() below is not expected to tear oldvm down.
	 */
	KASSERT(oldvm->vm_refcnt > 1,
	    ("vmspace_switch_aio: oldvm dropping last reference"));
	vmspace_free(oldvm);
}
497 
498 void
499 _vm_map_lock(vm_map_t map, const char *file, int line)
500 {
501 
502 	if (map->system_map)
503 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
504 	else
505 		sx_xlock_(&map->lock, file, line);
506 	map->timestamp++;
507 }
508 
509 static void
510 vm_map_process_deferred(void)
511 {
512 	struct thread *td;
513 	vm_map_entry_t entry, next;
514 	vm_object_t object;
515 
516 	td = curthread;
517 	entry = td->td_map_def_user;
518 	td->td_map_def_user = NULL;
519 	while (entry != NULL) {
520 		next = entry->next;
521 		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
522 			/*
523 			 * Decrement the object's writemappings and
524 			 * possibly the vnode's v_writecount.
525 			 */
526 			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
527 			    ("Submap with writecount"));
528 			object = entry->object.vm_object;
529 			KASSERT(object != NULL, ("No object for writecount"));
530 			vnode_pager_release_writecount(object, entry->start,
531 			    entry->end);
532 		}
533 		vm_map_entry_deallocate(entry, FALSE);
534 		entry = next;
535 	}
536 }
537 
538 void
539 _vm_map_unlock(vm_map_t map, const char *file, int line)
540 {
541 
542 	if (map->system_map)
543 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
544 	else {
545 		sx_xunlock_(&map->lock, file, line);
546 		vm_map_process_deferred();
547 	}
548 }
549 
550 void
551 _vm_map_lock_read(vm_map_t map, const char *file, int line)
552 {
553 
554 	if (map->system_map)
555 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
556 	else
557 		sx_slock_(&map->lock, file, line);
558 }
559 
560 void
561 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
562 {
563 
564 	if (map->system_map)
565 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
566 	else {
567 		sx_sunlock_(&map->lock, file, line);
568 		vm_map_process_deferred();
569 	}
570 }
571 
572 int
573 _vm_map_trylock(vm_map_t map, const char *file, int line)
574 {
575 	int error;
576 
577 	error = map->system_map ?
578 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
579 	    !sx_try_xlock_(&map->lock, file, line);
580 	if (error == 0)
581 		map->timestamp++;
582 	return (error == 0);
583 }
584 
585 int
586 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
587 {
588 	int error;
589 
590 	error = map->system_map ?
591 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
592 	    !sx_try_slock_(&map->lock, file, line);
593 	return (error == 0);
594 }
595 
596 /*
597  *	_vm_map_lock_upgrade:	[ internal use only ]
598  *
599  *	Tries to upgrade a read (shared) lock on the specified map to a write
600  *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
601  *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
602  *	returned without a read or write lock held.
603  *
604  *	Requires that the map be read locked.
605  */
int
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
{
	unsigned int last_timestamp;

	if (map->system_map) {
		/*
		 * System maps take the same mutex for read and write
		 * locking, so there is nothing to upgrade; only verify
		 * ownership.
		 */
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else {
		if (!sx_try_upgrade_(&map->lock, file, line)) {
			/*
			 * The in-place upgrade failed.  Remember the
			 * timestamp, drop the shared lock and process
			 * deferred entries, then take the lock exclusively.
			 */
			last_timestamp = map->timestamp;
			sx_sunlock_(&map->lock, file, line);
			vm_map_process_deferred();
			/*
			 * If the map's timestamp does not change while the
			 * map is unlocked, then the upgrade succeeds.
			 */
			sx_xlock_(&map->lock, file, line);
			if (last_timestamp != map->timestamp) {
				/* Fail with the map left unlocked. */
				sx_xunlock_(&map->lock, file, line);
				return (1);
			}
		}
	}
	/* The map is now write locked; account for it as _vm_map_lock() does. */
	map->timestamp++;
	return (0);
}
632 
633 void
634 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
635 {
636 
637 	if (map->system_map) {
638 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
639 	} else
640 		sx_downgrade_(&map->lock, file, line);
641 }
642 
643 /*
644  *	vm_map_locked:
645  *
646  *	Returns a non-zero value if the caller holds a write (exclusive) lock
647  *	on the specified map and the value "0" otherwise.
648  */
649 int
650 vm_map_locked(vm_map_t map)
651 {
652 
653 	if (map->system_map)
654 		return (mtx_owned(&map->system_mtx));
655 	else
656 		return (sx_xlocked(&map->lock));
657 }
658 
659 #ifdef INVARIANTS
660 static void
661 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
662 {
663 
664 	if (map->system_map)
665 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
666 	else
667 		sx_assert_(&map->lock, SA_XLOCKED, file, line);
668 }
669 
670 #define	VM_MAP_ASSERT_LOCKED(map) \
671     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
672 
673 static void
674 _vm_map_assert_consistent(vm_map_t map)
675 {
676 	vm_map_entry_t entry;
677 	vm_map_entry_t child;
678 	vm_size_t max_left, max_right;
679 
680 	for (entry = map->header.next; entry != &map->header;
681 	    entry = entry->next) {
682 		KASSERT(entry->prev->end <= entry->start,
683 		    ("map %p prev->end = %jx, start = %jx", map,
684 		    (uintmax_t)entry->prev->end, (uintmax_t)entry->start));
685 		KASSERT(entry->start < entry->end,
686 		    ("map %p start = %jx, end = %jx", map,
687 		    (uintmax_t)entry->start, (uintmax_t)entry->end));
688 		KASSERT(entry->end <= entry->next->start,
689 		    ("map %p end = %jx, next->start = %jx", map,
690 		    (uintmax_t)entry->end, (uintmax_t)entry->next->start));
691 		KASSERT(entry->left == NULL ||
692 		    entry->left->start < entry->start,
693 		    ("map %p left->start = %jx, start = %jx", map,
694 		    (uintmax_t)entry->left->start, (uintmax_t)entry->start));
695 		KASSERT(entry->right == NULL ||
696 		    entry->start < entry->right->start,
697 		    ("map %p start = %jx, right->start = %jx", map,
698 		    (uintmax_t)entry->start, (uintmax_t)entry->right->start));
699 		child = entry->left;
700 		max_left = (child != NULL) ? child->max_free :
701 			entry->start - entry->prev->end;
702 		child = entry->right;
703 		max_right = (child != NULL) ? child->max_free :
704 			entry->next->start - entry->end;
705 		KASSERT(entry->max_free == MAX(max_left, max_right),
706 		    ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
707 		     (uintmax_t)entry->max_free,
708 		     (uintmax_t)max_left, (uintmax_t)max_right));
709 	}
710 }
711 
712 #define VM_MAP_ASSERT_CONSISTENT(map) \
713     _vm_map_assert_consistent(map)
714 #else
715 #define	VM_MAP_ASSERT_LOCKED(map)
716 #define VM_MAP_ASSERT_CONSISTENT(map)
717 #endif
718 
719 /*
720  *	_vm_map_unlock_and_wait:
721  *
722  *	Atomically releases the lock on the specified map and puts the calling
723  *	thread to sleep.  The calling thread will remain asleep until either
724  *	vm_map_wakeup() is performed on the map or the specified timeout is
725  *	exceeded.
726  *
727  *	WARNING!  This function does not perform deferred deallocations of
728  *	objects and map	entries.  Therefore, the calling thread is expected to
729  *	reacquire the map lock after reawakening and later perform an ordinary
730  *	unlock operation, such as vm_map_unlock(), before completing its
731  *	operation on the map.
732  */
733 int
734 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
735 {
736 
737 	mtx_lock(&map_sleep_mtx);
738 	if (map->system_map)
739 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
740 	else
741 		sx_xunlock_(&map->lock, file, line);
742 	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
743 	    timo));
744 }
745 
746 /*
747  *	vm_map_wakeup:
748  *
749  *	Awaken any threads that have slept on the map using
750  *	vm_map_unlock_and_wait().
751  */
void
vm_map_wakeup(vm_map_t map)
{

	/*
	 * Acquire and release map_sleep_mtx to prevent a wakeup()
	 * from being performed (and lost) between the map unlock
	 * and the msleep() in _vm_map_unlock_and_wait().
	 */
	mtx_lock(&map_sleep_mtx);
	mtx_unlock(&map_sleep_mtx);
	/* &map->root is the channel _vm_map_unlock_and_wait() sleeps on. */
	wakeup(&map->root);
}
765 
766 void
767 vm_map_busy(vm_map_t map)
768 {
769 
770 	VM_MAP_ASSERT_LOCKED(map);
771 	map->busy++;
772 }
773 
774 void
775 vm_map_unbusy(vm_map_t map)
776 {
777 
778 	VM_MAP_ASSERT_LOCKED(map);
779 	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
780 	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
781 		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
782 		wakeup(&map->busy);
783 	}
784 }
785 
786 void
787 vm_map_wait_busy(vm_map_t map)
788 {
789 
790 	VM_MAP_ASSERT_LOCKED(map);
791 	while (map->busy) {
792 		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
793 		if (map->system_map)
794 			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
795 		else
796 			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
797 	}
798 	map->timestamp++;
799 }
800 
/*
 * Return the resident count reported by the pmap module for the
 * vmspace's pmap.
 */
long
vmspace_resident_count(struct vmspace *vmspace)
{

	return (pmap_resident_count(vmspace_pmap(vmspace)));
}
806 
807 /*
808  *	vm_map_create:
809  *
810  *	Creates and returns a new empty VM map with
811  *	the given physical map structure, and having
812  *	the given lower and upper address bounds.
813  */
814 vm_map_t
815 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
816 {
817 	vm_map_t result;
818 
819 	result = uma_zalloc(mapzone, M_WAITOK);
820 	CTR1(KTR_VM, "vm_map_create: %p", result);
821 	_vm_map_init(result, pmap, min, max);
822 	return (result);
823 }
824 
825 /*
826  * Initialize an existing vm_map structure
827  * such as that in the vmspace structure.
828  */
829 static void
830 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
831 {
832 
833 	map->header.next = map->header.prev = &map->header;
834 	map->header.eflags = MAP_ENTRY_HEADER;
835 	map->needs_wakeup = FALSE;
836 	map->system_map = 0;
837 	map->pmap = pmap;
838 	map->header.end = min;
839 	map->header.start = max;
840 	map->flags = 0;
841 	map->root = NULL;
842 	map->timestamp = 0;
843 	map->busy = 0;
844 	map->anon_loc = 0;
845 }
846 
847 void
848 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
849 {
850 
851 	_vm_map_init(map, pmap, min, max);
852 	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
853 	sx_init(&map->lock, "user map");
854 }
855 
856 /*
857  *	vm_map_entry_dispose:	[ internal use only ]
858  *
859  *	Inverse of vm_map_entry_create.
860  */
861 static void
862 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
863 {
864 	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
865 }
866 
867 /*
868  *	vm_map_entry_create:	[ internal use only ]
869  *
870  *	Allocates a VM map entry for insertion.
871  *	No entry fields are filled in.
872  */
873 static vm_map_entry_t
874 vm_map_entry_create(vm_map_t map)
875 {
876 	vm_map_entry_t new_entry;
877 
878 	if (map->system_map)
879 		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
880 	else
881 		new_entry = uma_zalloc(mapentzone, M_WAITOK);
882 	if (new_entry == NULL)
883 		panic("vm_map_entry_create: kernel resources exhausted");
884 	return (new_entry);
885 }
886 
887 /*
888  *	vm_map_entry_set_behavior:
889  *
890  *	Set the expected access behavior, either normal, random, or
891  *	sequential.
892  */
893 static inline void
894 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
895 {
896 	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
897 	    (behavior & MAP_ENTRY_BEHAV_MASK);
898 }
899 
900 /*
901  *	vm_map_entry_set_max_free:
902  *
903  *	Set the max_free field in a vm_map_entry.
904  */
905 static inline void
906 vm_map_entry_set_max_free(vm_map_entry_t entry)
907 {
908 	vm_map_entry_t child;
909 	vm_size_t max_left, max_right;
910 
911 	child = entry->left;
912 	max_left = (child != NULL) ? child->max_free :
913 	    entry->start - entry->prev->end;
914 	child = entry->right;
915 	max_right = (child != NULL) ? child->max_free :
916 	    entry->next->start - entry->end;
917 	entry->max_free = MAX(max_left, max_right);
918 }
919 
/*
 * SPLAY_LEFT_STEP: take one step down to the left from "root".
 *
 * "y" is set to root's left child.  If that child exists and "test"
 * holds, the child is rotated above root (root's max_free is recomputed)
 * and becomes the new root, and "y" becomes its left child.  The
 * resulting root is then pushed onto "rlist", linked through its left
 * pointer, and "root" advances to "y".
 *
 * "root", "y" and "rlist" are assigned to and must be lvalues; "test" is
 * evaluated after "y" has been set and may refer to it.
 */
#define SPLAY_LEFT_STEP(root, y, rlist, test) do {	\
	y = root->left;					\
	if (y != NULL && (test)) {			\
		/* Rotate right and make y root. */	\
		root->left = y->right;			\
		y->right = root;			\
		vm_map_entry_set_max_free(root);	\
		root = y;				\
		y = root->left;				\
	}						\
	/* Put root on rlist. */			\
	root->left = rlist;				\
	rlist = root;					\
	root = y;					\
} while (0)
935 
/*
 * SPLAY_RIGHT_STEP: take one step down to the right from "root".
 *
 * "y" is set to root's right child.  If that child exists and "test"
 * holds, the child is rotated above root (root's max_free is recomputed)
 * and becomes the new root, and "y" becomes its right child.  The
 * resulting root is then pushed onto "llist", linked through its right
 * pointer, and "root" advances to "y".
 *
 * "root", "y" and "llist" are assigned to and must be lvalues; "test" is
 * evaluated after "y" has been set and may refer to it.
 */
#define SPLAY_RIGHT_STEP(root, y, llist, test) do {	\
	y = root->right;				\
	if (y != NULL && (test)) {			\
		/* Rotate left and make y root. */	\
		root->right = y->left;			\
		y->left = root;				\
		vm_map_entry_set_max_free(root);	\
		root = y;				\
		y = root->right;			\
	}						\
	/* Put root on llist. */			\
	root->right = llist;				\
	llist = root;					\
	root = y;					\
} while (0)
951 
952 /*
953  * Walk down the tree until we find addr or a NULL pointer where addr would go,
954  * breaking off left and right subtrees of nodes less than, or greater than
955  * addr.  Treat pointers to nodes with max_free < length as NULL pointers.
956  * llist and rlist are the two sides in reverse order (bottom-up), with llist
957  * linked by the right pointer and rlist linked by the left pointer in the
958  * vm_map_entry.
959  */
960 static vm_map_entry_t
961 vm_map_splay_split(vm_offset_t addr, vm_size_t length,
962     vm_map_entry_t root, vm_map_entry_t *out_llist, vm_map_entry_t *out_rlist)
963 {
964 	vm_map_entry_t llist, rlist;
965 	vm_map_entry_t y;
966 
967 	llist = NULL;
968 	rlist = NULL;
969 	while (root != NULL && root->max_free >= length) {
970 		if (addr < root->start) {
971 			SPLAY_LEFT_STEP(root, y, rlist,
972 			    y->max_free >= length && addr < y->start);
973 		} else if (addr >= root->end) {
974 			SPLAY_RIGHT_STEP(root, y, llist,
975 			    y->max_free >= length && addr >= y->end);
976 		} else
977 			break;
978 	}
979 	*out_llist = llist;
980 	*out_rlist = rlist;
981 	return (root);
982 }
983 
984 static void
985 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *iolist)
986 {
987 	vm_map_entry_t rlist, y;
988 
989 	root = root->right;
990 	rlist = *iolist;
991 	while (root != NULL)
992 		SPLAY_LEFT_STEP(root, y, rlist, true);
993 	*iolist = rlist;
994 }
995 
996 static void
997 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *iolist)
998 {
999 	vm_map_entry_t llist, y;
1000 
1001 	root = root->left;
1002 	llist = *iolist;
1003 	while (root != NULL)
1004 		SPLAY_RIGHT_STEP(root, y, llist, true);
1005 	*iolist = llist;
1006 }
1007 
1008 /*
1009  * Walk back up the two spines, flip the pointers and set max_free.  The
1010  * subtrees of the root go at the bottom of llist and rlist.
1011  */
1012 static vm_map_entry_t
1013 vm_map_splay_merge(vm_map_entry_t root,
1014     vm_map_entry_t llist, vm_map_entry_t rlist,
1015     vm_map_entry_t ltree, vm_map_entry_t rtree)
1016 {
1017 	vm_map_entry_t y;
1018 
1019 	while (llist != NULL) {
1020 		y = llist->right;
1021 		llist->right = ltree;
1022 		vm_map_entry_set_max_free(llist);
1023 		ltree = llist;
1024 		llist = y;
1025 	}
1026 	while (rlist != NULL) {
1027 		y = rlist->left;
1028 		rlist->left = rtree;
1029 		vm_map_entry_set_max_free(rlist);
1030 		rtree = rlist;
1031 		rlist = y;
1032 	}
1033 
1034 	/*
1035 	 * Final assembly: add ltree and rtree as subtrees of root.
1036 	 */
1037 	root->left = ltree;
1038 	root->right = rtree;
1039 	vm_map_entry_set_max_free(root);
1040 
1041 	return (root);
1042 }
1043 
1044 /*
1045  *	vm_map_entry_splay:
1046  *
1047  *	The Sleator and Tarjan top-down splay algorithm with the
1048  *	following variation.  Max_free must be computed bottom-up, so
1049  *	on the downward pass, maintain the left and right spines in
1050  *	reverse order.  Then, make a second pass up each side to fix
1051  *	the pointers and compute max_free.  The time bound is O(log n)
1052  *	amortized.
1053  *
1054  *	The new root is the vm_map_entry containing "addr", or else an
1055  *	adjacent entry (lower if possible) if addr is not in the tree.
1056  *
1057  *	The map must be locked, and leaves it so.
1058  *
1059  *	Returns: the new root.
1060  */
1061 static vm_map_entry_t
1062 vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
1063 {
1064 	vm_map_entry_t llist, rlist;
1065 
1066 	root = vm_map_splay_split(addr, 0, root, &llist, &rlist);
1067 	if (root != NULL) {
1068 		/* do nothing */
1069 	} else if (llist != NULL) {
1070 		/*
1071 		 * Recover the greatest node in the left
1072 		 * subtree and make it the root.
1073 		 */
1074 		root = llist;
1075 		llist = root->right;
1076 		root->right = NULL;
1077 	} else if (rlist != NULL) {
1078 		/*
1079 		 * Recover the least node in the right
1080 		 * subtree and make it the root.
1081 		 */
1082 		root = rlist;
1083 		rlist = root->left;
1084 		root->left = NULL;
1085 	} else {
1086 		/* There is no root. */
1087 		return (NULL);
1088 	}
1089 	return (vm_map_splay_merge(root, llist, rlist,
1090 	    root->left, root->right));
1091 }
1092 
1093 /*
1094  *	vm_map_entry_{un,}link:
1095  *
1096  *	Insert/remove entries from maps.
1097  */
1098 static void
1099 vm_map_entry_link(vm_map_t map,
1100 		  vm_map_entry_t entry)
1101 {
1102 	vm_map_entry_t llist, rlist, root;
1103 
1104 	CTR3(KTR_VM,
1105 	    "vm_map_entry_link: map %p, nentries %d, entry %p", map,
1106 	    map->nentries, entry);
1107 	VM_MAP_ASSERT_LOCKED(map);
1108 	map->nentries++;
1109 	root = map->root;
1110 	root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
1111 	KASSERT(root == NULL,
1112 	    ("vm_map_entry_link: link object already mapped"));
1113 	entry->prev = (llist == NULL) ? &map->header : llist;
1114 	entry->next = (rlist == NULL) ? &map->header : rlist;
1115 	entry->prev->next = entry->next->prev = entry;
1116 	root = vm_map_splay_merge(entry, llist, rlist, NULL, NULL);
1117 	map->root = entry;
1118 	VM_MAP_ASSERT_CONSISTENT(map);
1119 }
1120 
1121 enum unlink_merge_type {
1122 	UNLINK_MERGE_PREV,
1123 	UNLINK_MERGE_NONE,
1124 	UNLINK_MERGE_NEXT
1125 };
1126 
1127 static void
1128 vm_map_entry_unlink(vm_map_t map,
1129 		    vm_map_entry_t entry,
1130 		    enum unlink_merge_type op)
1131 {
1132 	vm_map_entry_t llist, rlist, root, y;
1133 
1134 	VM_MAP_ASSERT_LOCKED(map);
1135 	llist = entry->prev;
1136 	rlist = entry->next;
1137 	llist->next = rlist;
1138 	rlist->prev = llist;
1139 	root = map->root;
1140 	root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
1141 	KASSERT(root != NULL,
1142 	    ("vm_map_entry_unlink: unlink object not mapped"));
1143 
1144 	switch (op) {
1145 	case UNLINK_MERGE_PREV:
1146 		vm_map_splay_findprev(root, &llist);
1147 		llist->end = root->end;
1148 		y = root->right;
1149 		root = llist;
1150 		llist = root->right;
1151 		root->right = y;
1152 		break;
1153 	case UNLINK_MERGE_NEXT:
1154 		vm_map_splay_findnext(root, &rlist);
1155 		rlist->start = root->start;
1156 		rlist->offset = root->offset;
1157 		y = root->left;
1158 		root = rlist;
1159 		rlist = root->left;
1160 		root->left = y;
1161 		break;
1162 	case UNLINK_MERGE_NONE:
1163 		vm_map_splay_findprev(root, &llist);
1164 		vm_map_splay_findnext(root, &rlist);
1165 		if (llist != NULL) {
1166 			root = llist;
1167 			llist = root->right;
1168 			root->right = NULL;
1169 		} else if (rlist != NULL) {
1170 			root = rlist;
1171 			rlist = root->left;
1172 			root->left = NULL;
1173 		} else
1174 			root = NULL;
1175 		break;
1176 	}
1177 	if (root != NULL)
1178 		root = vm_map_splay_merge(root, llist, rlist,
1179 		    root->left, root->right);
1180 	map->root = root;
1181 	VM_MAP_ASSERT_CONSISTENT(map);
1182 	map->nentries--;
1183 	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1184 	    map->nentries, entry);
1185 }
1186 
1187 /*
1188  *	vm_map_entry_resize_free:
1189  *
1190  *	Recompute the amount of free space following a modified vm_map_entry
1191  *	and propagate those values up the tree.  Call this function after
1192  *	resizing a map entry in-place by changing the end value, without a
1193  *	call to vm_map_entry_link() or _unlink().
1194  *
1195  *	The map must be locked, and leaves it so.
1196  */
1197 static void
1198 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
1199 {
1200 	vm_map_entry_t llist, rlist, root;
1201 
1202 	VM_MAP_ASSERT_LOCKED(map);
1203 	root = map->root;
1204 	root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
1205 	KASSERT(root != NULL,
1206 	    ("vm_map_entry_resize_free: resize_free object not mapped"));
1207 	vm_map_splay_findnext(root, &rlist);
1208 	root->right = NULL;
1209 	map->root = vm_map_splay_merge(root, llist, rlist,
1210 	    root->left, root->right);
1211 	VM_MAP_ASSERT_CONSISTENT(map);
1212 	CTR3(KTR_VM, "vm_map_entry_resize_free: map %p, nentries %d, entry %p", map,
1213 	    map->nentries, entry);
1214 }
1215 
1216 /*
1217  *	vm_map_lookup_entry:	[ internal use only ]
1218  *
1219  *	Finds the map entry containing (or
1220  *	immediately preceding) the specified address
1221  *	in the given map; the entry is returned
1222  *	in the "entry" parameter.  The boolean
1223  *	result indicates whether the address is
1224  *	actually contained in the map.
1225  */
1226 boolean_t
1227 vm_map_lookup_entry(
1228 	vm_map_t map,
1229 	vm_offset_t address,
1230 	vm_map_entry_t *entry)	/* OUT */
1231 {
1232 	vm_map_entry_t cur, lbound;
1233 	boolean_t locked;
1234 
1235 	/*
1236 	 * If the map is empty, then the map entry immediately preceding
1237 	 * "address" is the map's header.
1238 	 */
1239 	cur = map->root;
1240 	if (cur == NULL) {
1241 		*entry = &map->header;
1242 		return (FALSE);
1243 	}
1244 	if (address >= cur->start && cur->end > address) {
1245 		*entry = cur;
1246 		return (TRUE);
1247 	}
1248 	if ((locked = vm_map_locked(map)) ||
1249 	    sx_try_upgrade(&map->lock)) {
1250 		/*
1251 		 * Splay requires a write lock on the map.  However, it only
1252 		 * restructures the binary search tree; it does not otherwise
1253 		 * change the map.  Thus, the map's timestamp need not change
1254 		 * on a temporary upgrade.
1255 		 */
1256 		map->root = cur = vm_map_entry_splay(address, cur);
1257 		VM_MAP_ASSERT_CONSISTENT(map);
1258 		if (!locked)
1259 			sx_downgrade(&map->lock);
1260 
1261 		/*
1262 		 * If "address" is contained within a map entry, the new root
1263 		 * is that map entry.  Otherwise, the new root is a map entry
1264 		 * immediately before or after "address".
1265 		 */
1266 		if (address < cur->start) {
1267 			*entry = &map->header;
1268 			return (FALSE);
1269 		}
1270 		*entry = cur;
1271 		return (address < cur->end);
1272 	}
1273 	/*
1274 	 * Since the map is only locked for read access, perform a
1275 	 * standard binary search tree lookup for "address".
1276 	 */
1277 	lbound = &map->header;
1278 	do {
1279 		if (address < cur->start) {
1280 			cur = cur->left;
1281 		} else if (cur->end <= address) {
1282 			lbound = cur;
1283 			cur = cur->right;
1284 		} else {
1285 			*entry = cur;
1286 			return (TRUE);
1287 		}
1288 	} while (cur != NULL);
1289 	*entry = lbound;
1290 	return (FALSE);
1291 }
1292 
1293 /*
1294  *	vm_map_insert:
1295  *
1296  *	Inserts the given whole VM object into the target
1297  *	map at the specified address range.  The object's
1298  *	size should match that of the address range.
1299  *
1300  *	Requires that the map be locked, and leaves it so.
1301  *
1302  *	If object is non-NULL, ref count must be bumped by caller
1303  *	prior to making call to account for the new entry.
1304  */
1305 int
1306 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1307     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1308 {
1309 	vm_map_entry_t new_entry, prev_entry, temp_entry;
1310 	struct ucred *cred;
1311 	vm_eflags_t protoeflags;
1312 	vm_inherit_t inheritance;
1313 
1314 	VM_MAP_ASSERT_LOCKED(map);
1315 	KASSERT(object != kernel_object ||
1316 	    (cow & MAP_COPY_ON_WRITE) == 0,
1317 	    ("vm_map_insert: kernel object and COW"));
1318 	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
1319 	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1320 	KASSERT((prot & ~max) == 0,
1321 	    ("prot %#x is not subset of max_prot %#x", prot, max));
1322 
1323 	/*
1324 	 * Check that the start and end points are not bogus.
1325 	 */
1326 	if (start < vm_map_min(map) || end > vm_map_max(map) ||
1327 	    start >= end)
1328 		return (KERN_INVALID_ADDRESS);
1329 
1330 	/*
1331 	 * Find the entry prior to the proposed starting address; if it's part
1332 	 * of an existing entry, this range is bogus.
1333 	 */
1334 	if (vm_map_lookup_entry(map, start, &temp_entry))
1335 		return (KERN_NO_SPACE);
1336 
1337 	prev_entry = temp_entry;
1338 
1339 	/*
1340 	 * Assert that the next entry doesn't overlap the end point.
1341 	 */
1342 	if (prev_entry->next->start < end)
1343 		return (KERN_NO_SPACE);
1344 
1345 	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1346 	    max != VM_PROT_NONE))
1347 		return (KERN_INVALID_ARGUMENT);
1348 
1349 	protoeflags = 0;
1350 	if (cow & MAP_COPY_ON_WRITE)
1351 		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1352 	if (cow & MAP_NOFAULT)
1353 		protoeflags |= MAP_ENTRY_NOFAULT;
1354 	if (cow & MAP_DISABLE_SYNCER)
1355 		protoeflags |= MAP_ENTRY_NOSYNC;
1356 	if (cow & MAP_DISABLE_COREDUMP)
1357 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
1358 	if (cow & MAP_STACK_GROWS_DOWN)
1359 		protoeflags |= MAP_ENTRY_GROWS_DOWN;
1360 	if (cow & MAP_STACK_GROWS_UP)
1361 		protoeflags |= MAP_ENTRY_GROWS_UP;
1362 	if (cow & MAP_VN_WRITECOUNT)
1363 		protoeflags |= MAP_ENTRY_VN_WRITECNT;
1364 	if ((cow & MAP_CREATE_GUARD) != 0)
1365 		protoeflags |= MAP_ENTRY_GUARD;
1366 	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1367 		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1368 	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1369 		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1370 	if (cow & MAP_INHERIT_SHARE)
1371 		inheritance = VM_INHERIT_SHARE;
1372 	else
1373 		inheritance = VM_INHERIT_DEFAULT;
1374 
1375 	cred = NULL;
1376 	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1377 		goto charged;
1378 	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1379 	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1380 		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1381 			return (KERN_RESOURCE_SHORTAGE);
1382 		KASSERT(object == NULL ||
1383 		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1384 		    object->cred == NULL,
1385 		    ("overcommit: vm_map_insert o %p", object));
1386 		cred = curthread->td_ucred;
1387 	}
1388 
1389 charged:
1390 	/* Expand the kernel pmap, if necessary. */
1391 	if (map == kernel_map && end > kernel_vm_end)
1392 		pmap_growkernel(end);
1393 	if (object != NULL) {
1394 		/*
1395 		 * OBJ_ONEMAPPING must be cleared unless this mapping
1396 		 * is trivially proven to be the only mapping for any
1397 		 * of the object's pages.  (Object granularity
1398 		 * reference counting is insufficient to recognize
1399 		 * aliases with precision.)
1400 		 */
1401 		VM_OBJECT_WLOCK(object);
1402 		if (object->ref_count > 1 || object->shadow_count != 0)
1403 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
1404 		VM_OBJECT_WUNLOCK(object);
1405 	} else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
1406 	    protoeflags &&
1407 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
1408 	    prev_entry->end == start && (prev_entry->cred == cred ||
1409 	    (prev_entry->object.vm_object != NULL &&
1410 	    prev_entry->object.vm_object->cred == cred)) &&
1411 	    vm_object_coalesce(prev_entry->object.vm_object,
1412 	    prev_entry->offset,
1413 	    (vm_size_t)(prev_entry->end - prev_entry->start),
1414 	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
1415 	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1416 		/*
1417 		 * We were able to extend the object.  Determine if we
1418 		 * can extend the previous map entry to include the
1419 		 * new range as well.
1420 		 */
1421 		if (prev_entry->inheritance == inheritance &&
1422 		    prev_entry->protection == prot &&
1423 		    prev_entry->max_protection == max &&
1424 		    prev_entry->wired_count == 0) {
1425 			KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
1426 			    0, ("prev_entry %p has incoherent wiring",
1427 			    prev_entry));
1428 			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1429 				map->size += end - prev_entry->end;
1430 			prev_entry->end = end;
1431 			vm_map_entry_resize_free(map, prev_entry);
1432 			vm_map_simplify_entry(map, prev_entry);
1433 			return (KERN_SUCCESS);
1434 		}
1435 
1436 		/*
1437 		 * If we can extend the object but cannot extend the
1438 		 * map entry, we have to create a new map entry.  We
1439 		 * must bump the ref count on the extended object to
1440 		 * account for it.  object may be NULL.
1441 		 */
1442 		object = prev_entry->object.vm_object;
1443 		offset = prev_entry->offset +
1444 		    (prev_entry->end - prev_entry->start);
1445 		vm_object_reference(object);
1446 		if (cred != NULL && object != NULL && object->cred != NULL &&
1447 		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1448 			/* Object already accounts for this uid. */
1449 			cred = NULL;
1450 		}
1451 	}
1452 	if (cred != NULL)
1453 		crhold(cred);
1454 
1455 	/*
1456 	 * Create a new entry
1457 	 */
1458 	new_entry = vm_map_entry_create(map);
1459 	new_entry->start = start;
1460 	new_entry->end = end;
1461 	new_entry->cred = NULL;
1462 
1463 	new_entry->eflags = protoeflags;
1464 	new_entry->object.vm_object = object;
1465 	new_entry->offset = offset;
1466 
1467 	new_entry->inheritance = inheritance;
1468 	new_entry->protection = prot;
1469 	new_entry->max_protection = max;
1470 	new_entry->wired_count = 0;
1471 	new_entry->wiring_thread = NULL;
1472 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1473 	new_entry->next_read = start;
1474 
1475 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1476 	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1477 	new_entry->cred = cred;
1478 
1479 	/*
1480 	 * Insert the new entry into the list
1481 	 */
1482 	vm_map_entry_link(map, new_entry);
1483 	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1484 		map->size += new_entry->end - new_entry->start;
1485 
1486 	/*
1487 	 * Try to coalesce the new entry with both the previous and next
1488 	 * entries in the list.  Previously, we only attempted to coalesce
1489 	 * with the previous entry when object is NULL.  Here, we handle the
1490 	 * other cases, which are less common.
1491 	 */
1492 	vm_map_simplify_entry(map, new_entry);
1493 
1494 	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1495 		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1496 		    end - start, cow & MAP_PREFAULT_PARTIAL);
1497 	}
1498 
1499 	return (KERN_SUCCESS);
1500 }
1501 
1502 /*
1503  *	vm_map_findspace:
1504  *
1505  *	Find the first fit (lowest VM address) for "length" free bytes
1506  *	beginning at address >= start in the given map.
1507  *
1508  *	In a vm_map_entry, "max_free" is the maximum amount of
1509  *	contiguous free space between an entry in its subtree and a
1510  *	neighbor of that entry.  This allows finding a free region in
1511  *	one path down the tree, so O(log n) amortized with splay
1512  *	trees.
1513  *
1514  *	The map must be locked, and leaves it so.
1515  *
1516  *	Returns: starting address if sufficient space,
1517  *		 vm_map_max(map)-length+1 if insufficient space.
1518  */
1519 vm_offset_t
1520 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
1521 {
1522 	vm_map_entry_t llist, rlist, root, y;
1523 	vm_size_t left_length;
1524 
1525 	/*
1526 	 * Request must fit within min/max VM address and must avoid
1527 	 * address wrap.
1528 	 */
1529 	start = MAX(start, vm_map_min(map));
1530 	if (start + length > vm_map_max(map) || start + length < start)
1531 		return (vm_map_max(map) - length + 1);
1532 
1533 	/* Empty tree means wide open address space. */
1534 	if (map->root == NULL)
1535 		return (start);
1536 
1537 	/*
1538 	 * After splay, if start comes before root node, then there
1539 	 * must be a gap from start to the root.
1540 	 */
1541 	root = vm_map_splay_split(start, length, map->root,
1542 	    &llist, &rlist);
1543 	if (root != NULL)
1544 		start = root->end;
1545 	else if (rlist != NULL) {
1546 		root = rlist;
1547 		rlist = root->left;
1548 		root->left = NULL;
1549 	} else {
1550 		root = llist;
1551 		llist = root->right;
1552 		root->right = NULL;
1553 	}
1554 	map->root = vm_map_splay_merge(root, llist, rlist,
1555 	    root->left, root->right);
1556 	VM_MAP_ASSERT_CONSISTENT(map);
1557 	if (start + length <= root->start)
1558 		return (start);
1559 
1560 	/*
1561 	 * Root is the last node that might begin its gap before
1562 	 * start, and this is the last comparison where address
1563 	 * wrap might be a problem.
1564 	 */
1565 	if (root->right == NULL &&
1566 	    start + length <= vm_map_max(map))
1567 		return (start);
1568 
1569 	/* With max_free, can immediately tell if no solution. */
1570 	if (root->right == NULL || length > root->right->max_free)
1571 		return (vm_map_max(map) - length + 1);
1572 
1573 	/*
1574 	 * Splay for the least large-enough gap in the right subtree.
1575 	 */
1576 	llist = NULL;
1577         rlist = NULL;
1578 	for (left_length = 0; ;
1579 	     left_length = root->left != NULL ?
1580 	     root->left->max_free : root->start - llist->end) {
1581 		if (length <= left_length)
1582 			SPLAY_LEFT_STEP(root, y, rlist,
1583 			    length <= (y->left != NULL ?
1584 			    y->left->max_free : y->start - llist->end));
1585 		else
1586 			SPLAY_RIGHT_STEP(root, y, llist,
1587 			    length > (y->left != NULL ?
1588 			    y->left->max_free : y->start - root->end));
1589 		if (root == NULL)
1590 			break;
1591 	}
1592 	root = llist;
1593 	llist = root->right;
1594 	if ((y = rlist) == NULL)
1595 		root->right = NULL;
1596 	else {
1597 		rlist = y->left;
1598 		y->left = NULL;
1599 		root->right = y->right;
1600 	}
1601 	root = vm_map_splay_merge(root, llist, rlist,
1602 	    root->left, root->right);
1603 	if (y != NULL) {
1604 		y->right = root->right;
1605 		vm_map_entry_set_max_free(y);
1606 		root->right = y;
1607 		vm_map_entry_set_max_free(root);
1608 	}
1609 	map->root = root;
1610 	VM_MAP_ASSERT_CONSISTENT(map);
1611 	return (root->end);
1612 }
1613 
1614 int
1615 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1616     vm_offset_t start, vm_size_t length, vm_prot_t prot,
1617     vm_prot_t max, int cow)
1618 {
1619 	vm_offset_t end;
1620 	int result;
1621 
1622 	end = start + length;
1623 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1624 	    object == NULL,
1625 	    ("vm_map_fixed: non-NULL backing object for stack"));
1626 	vm_map_lock(map);
1627 	VM_MAP_RANGE_CHECK(map, start, end);
1628 	if ((cow & MAP_CHECK_EXCL) == 0)
1629 		vm_map_delete(map, start, end);
1630 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1631 		result = vm_map_stack_locked(map, start, length, sgrowsiz,
1632 		    prot, max, cow);
1633 	} else {
1634 		result = vm_map_insert(map, object, offset, start, end,
1635 		    prot, max, cow);
1636 	}
1637 	vm_map_unlock(map);
1638 	return (result);
1639 }
1640 
1641 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
1642 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
1643 
1644 static int cluster_anon = 1;
1645 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
1646     &cluster_anon, 0,
1647     "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
1648 
1649 static bool
1650 clustering_anon_allowed(vm_offset_t addr)
1651 {
1652 
1653 	switch (cluster_anon) {
1654 	case 0:
1655 		return (false);
1656 	case 1:
1657 		return (addr == 0);
1658 	case 2:
1659 	default:
1660 		return (true);
1661 	}
1662 }
1663 
1664 static long aslr_restarts;
1665 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
1666     &aslr_restarts, 0,
1667     "Number of aslr failures");
1668 
1669 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
1670 
1671 /*
1672  * Searches for the specified amount of free space in the given map with the
1673  * specified alignment.  Performs an address-ordered, first-fit search from
1674  * the given address "*addr", with an optional upper bound "max_addr".  If the
1675  * parameter "alignment" is zero, then the alignment is computed from the
1676  * given (object, offset) pair so as to enable the greatest possible use of
1677  * superpage mappings.  Returns KERN_SUCCESS and the address of the free space
1678  * in "*addr" if successful.  Otherwise, returns KERN_NO_SPACE.
1679  *
1680  * The map must be locked.  Initially, there must be at least "length" bytes
1681  * of free space at the given address.
1682  */
1683 static int
1684 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1685     vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
1686     vm_offset_t alignment)
1687 {
1688 	vm_offset_t aligned_addr, free_addr;
1689 
1690 	VM_MAP_ASSERT_LOCKED(map);
1691 	free_addr = *addr;
1692 	KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
1693 	    ("caller failed to provide space %d at address %p",
1694 	     (int)length, (void*)free_addr));
1695 	for (;;) {
1696 		/*
1697 		 * At the start of every iteration, the free space at address
1698 		 * "*addr" is at least "length" bytes.
1699 		 */
1700 		if (alignment == 0)
1701 			pmap_align_superpage(object, offset, addr, length);
1702 		else if ((*addr & (alignment - 1)) != 0) {
1703 			*addr &= ~(alignment - 1);
1704 			*addr += alignment;
1705 		}
1706 		aligned_addr = *addr;
1707 		if (aligned_addr == free_addr) {
1708 			/*
1709 			 * Alignment did not change "*addr", so "*addr" must
1710 			 * still provide sufficient free space.
1711 			 */
1712 			return (KERN_SUCCESS);
1713 		}
1714 
1715 		/*
1716 		 * Test for address wrap on "*addr".  A wrapped "*addr" could
1717 		 * be a valid address, in which case vm_map_findspace() cannot
1718 		 * be relied upon to fail.
1719 		 */
1720 		if (aligned_addr < free_addr)
1721 			return (KERN_NO_SPACE);
1722 		*addr = vm_map_findspace(map, aligned_addr, length);
1723 		if (*addr + length > vm_map_max(map) ||
1724 		    (max_addr != 0 && *addr + length > max_addr))
1725 			return (KERN_NO_SPACE);
1726 		free_addr = *addr;
1727 		if (free_addr == aligned_addr) {
1728 			/*
1729 			 * If a successful call to vm_map_findspace() did not
1730 			 * change "*addr", then "*addr" must still be aligned
1731 			 * and provide sufficient free space.
1732 			 */
1733 			return (KERN_SUCCESS);
1734 		}
1735 	}
1736 }
1737 
1738 /*
1739  *	vm_map_find finds an unallocated region in the target address
1740  *	map with the given length.  The search is defined to be
1741  *	first-fit from the specified address; the region found is
1742  *	returned in the same parameter.
1743  *
1744  *	If object is non-NULL, ref count must be bumped by caller
1745  *	prior to making call to account for the new entry.
1746  */
1747 int
1748 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1749 	    vm_offset_t *addr,	/* IN/OUT */
1750 	    vm_size_t length, vm_offset_t max_addr, int find_space,
1751 	    vm_prot_t prot, vm_prot_t max, int cow)
1752 {
1753 	vm_offset_t alignment, curr_min_addr, min_addr;
1754 	int gap, pidx, rv, try;
1755 	bool cluster, en_aslr, update_anon;
1756 
1757 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1758 	    object == NULL,
1759 	    ("vm_map_find: non-NULL backing object for stack"));
1760 	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
1761 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
1762 	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
1763 	    (object->flags & OBJ_COLORED) == 0))
1764 		find_space = VMFS_ANY_SPACE;
1765 	if (find_space >> 8 != 0) {
1766 		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
1767 		alignment = (vm_offset_t)1 << (find_space >> 8);
1768 	} else
1769 		alignment = 0;
1770 	en_aslr = (map->flags & MAP_ASLR) != 0;
1771 	update_anon = cluster = clustering_anon_allowed(*addr) &&
1772 	    (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
1773 	    find_space != VMFS_NO_SPACE && object == NULL &&
1774 	    (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
1775 	    MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
1776 	curr_min_addr = min_addr = *addr;
1777 	if (en_aslr && min_addr == 0 && !cluster &&
1778 	    find_space != VMFS_NO_SPACE &&
1779 	    (map->flags & MAP_ASLR_IGNSTART) != 0)
1780 		curr_min_addr = min_addr = vm_map_min(map);
1781 	try = 0;
1782 	vm_map_lock(map);
1783 	if (cluster) {
1784 		curr_min_addr = map->anon_loc;
1785 		if (curr_min_addr == 0)
1786 			cluster = false;
1787 	}
1788 	if (find_space != VMFS_NO_SPACE) {
1789 		KASSERT(find_space == VMFS_ANY_SPACE ||
1790 		    find_space == VMFS_OPTIMAL_SPACE ||
1791 		    find_space == VMFS_SUPER_SPACE ||
1792 		    alignment != 0, ("unexpected VMFS flag"));
1793 again:
1794 		/*
1795 		 * When creating an anonymous mapping, try clustering
1796 		 * with an existing anonymous mapping first.
1797 		 *
1798 		 * We make up to two attempts to find address space
1799 		 * for a given find_space value. The first attempt may
1800 		 * apply randomization or may cluster with an existing
1801 		 * anonymous mapping. If this first attempt fails,
1802 		 * perform a first-fit search of the available address
1803 		 * space.
1804 		 *
1805 		 * If all tries failed, and find_space is
1806 		 * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE.
1807 		 * Again enable clustering and randomization.
1808 		 */
1809 		try++;
1810 		MPASS(try <= 2);
1811 
1812 		if (try == 2) {
1813 			/*
1814 			 * Second try: we failed either to find a
1815 			 * suitable region for randomizing the
1816 			 * allocation, or to cluster with an existing
1817 			 * mapping.  Retry with free run.
1818 			 */
1819 			curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
1820 			    vm_map_min(map) : min_addr;
1821 			atomic_add_long(&aslr_restarts, 1);
1822 		}
1823 
1824 		if (try == 1 && en_aslr && !cluster) {
1825 			/*
1826 			 * Find space for allocation, including
1827 			 * gap needed for later randomization.
1828 			 */
1829 			pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
1830 			    (find_space == VMFS_SUPER_SPACE || find_space ==
1831 			    VMFS_OPTIMAL_SPACE) ? 1 : 0;
1832 			gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
1833 			    (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
1834 			    aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
1835 			*addr = vm_map_findspace(map, curr_min_addr,
1836 			    length + gap * pagesizes[pidx]);
1837 			if (*addr + length + gap * pagesizes[pidx] >
1838 			    vm_map_max(map))
1839 				goto again;
1840 			/* And randomize the start address. */
1841 			*addr += (arc4random() % gap) * pagesizes[pidx];
1842 			if (max_addr != 0 && *addr + length > max_addr)
1843 				goto again;
1844 		} else {
1845 			*addr = vm_map_findspace(map, curr_min_addr, length);
1846 			if (*addr + length > vm_map_max(map) ||
1847 			    (max_addr != 0 && *addr + length > max_addr)) {
1848 				if (cluster) {
1849 					cluster = false;
1850 					MPASS(try == 1);
1851 					goto again;
1852 				}
1853 				rv = KERN_NO_SPACE;
1854 				goto done;
1855 			}
1856 		}
1857 
1858 		if (find_space != VMFS_ANY_SPACE &&
1859 		    (rv = vm_map_alignspace(map, object, offset, addr, length,
1860 		    max_addr, alignment)) != KERN_SUCCESS) {
1861 			if (find_space == VMFS_OPTIMAL_SPACE) {
1862 				find_space = VMFS_ANY_SPACE;
1863 				curr_min_addr = min_addr;
1864 				cluster = update_anon;
1865 				try = 0;
1866 				goto again;
1867 			}
1868 			goto done;
1869 		}
1870 	} else if ((cow & MAP_REMAP) != 0) {
1871 		if (*addr < vm_map_min(map) ||
1872 		    *addr + length > vm_map_max(map) ||
1873 		    *addr + length <= length) {
1874 			rv = KERN_INVALID_ADDRESS;
1875 			goto done;
1876 		}
1877 		vm_map_delete(map, *addr, *addr + length);
1878 	}
1879 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1880 		rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
1881 		    max, cow);
1882 	} else {
1883 		rv = vm_map_insert(map, object, offset, *addr, *addr + length,
1884 		    prot, max, cow);
1885 	}
1886 	if (rv == KERN_SUCCESS && update_anon)
1887 		map->anon_loc = *addr + length;
1888 done:
1889 	vm_map_unlock(map);
1890 	return (rv);
1891 }
1892 
1893 /*
1894  *	vm_map_find_min() is a variant of vm_map_find() that takes an
1895  *	additional parameter (min_addr) and treats the given address
1896  *	(*addr) differently.  Specifically, it treats *addr as a hint
1897  *	and not as the minimum address where the mapping is created.
1898  *
1899  *	This function works in two phases.  First, it tries to
1900  *	allocate above the hint.  If that fails and the hint is
1901  *	greater than min_addr, it performs a second pass, replacing
1902  *	the hint with min_addr as the minimum address for the
1903  *	allocation.
1904  */
1905 int
1906 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1907     vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
1908     vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
1909     int cow)
1910 {
1911 	vm_offset_t hint;
1912 	int rv;
1913 
1914 	hint = *addr;
1915 	for (;;) {
1916 		rv = vm_map_find(map, object, offset, addr, length, max_addr,
1917 		    find_space, prot, max, cow);
1918 		if (rv == KERN_SUCCESS || min_addr >= hint)
1919 			return (rv);
1920 		*addr = hint = min_addr;
1921 	}
1922 }
1923 
1924 /*
1925  * A map entry with any of the following flags set must not be merged with
1926  * another entry.
1927  */
1928 #define	MAP_ENTRY_NOMERGE_MASK	(MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
1929 	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)
1930 
1931 static bool
1932 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
1933 {
1934 
1935 	KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
1936 	    (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
1937 	    ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
1938 	    prev, entry));
1939 	return (prev->end == entry->start &&
1940 	    prev->object.vm_object == entry->object.vm_object &&
1941 	    (prev->object.vm_object == NULL ||
1942 	    prev->offset + (prev->end - prev->start) == entry->offset) &&
1943 	    prev->eflags == entry->eflags &&
1944 	    prev->protection == entry->protection &&
1945 	    prev->max_protection == entry->max_protection &&
1946 	    prev->inheritance == entry->inheritance &&
1947 	    prev->wired_count == entry->wired_count &&
1948 	    prev->cred == entry->cred);
1949 }
1950 
1951 static void
1952 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
1953 {
1954 
1955 	/*
1956 	 * If the backing object is a vnode object, vm_object_deallocate()
1957 	 * calls vrele().  However, vrele() does not lock the vnode because
1958 	 * the vnode has additional references.  Thus, the map lock can be
1959 	 * kept without causing a lock-order reversal with the vnode lock.
1960 	 *
1961 	 * Since we count the number of virtual page mappings in
1962 	 * object->un_pager.vnp.writemappings, the writemappings value
1963 	 * should not be adjusted when the entry is disposed of.
1964 	 */
1965 	if (entry->object.vm_object != NULL)
1966 		vm_object_deallocate(entry->object.vm_object);
1967 	if (entry->cred != NULL)
1968 		crfree(entry->cred);
1969 	vm_map_entry_dispose(map, entry);
1970 }
1971 
1972 /*
1973  *	vm_map_simplify_entry:
1974  *
1975  *	Simplify the given map entry by merging with either neighbor.  This
1976  *	routine also has the ability to merge with both neighbors.
1977  *
1978  *	The map must be locked.
1979  *
1980  *	This routine guarantees that the passed entry remains valid (though
1981  *	possibly extended).  When merging, this routine may delete one or
1982  *	both neighbors.
1983  */
1984 void
1985 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1986 {
1987 	vm_map_entry_t next, prev;
1988 
1989 	if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) != 0)
1990 		return;
1991 	prev = entry->prev;
1992 	if (vm_map_mergeable_neighbors(prev, entry)) {
1993 		vm_map_entry_unlink(map, prev, UNLINK_MERGE_NEXT);
1994 		vm_map_merged_neighbor_dispose(map, prev);
1995 	}
1996 	next = entry->next;
1997 	if (vm_map_mergeable_neighbors(entry, next)) {
1998 		vm_map_entry_unlink(map, next, UNLINK_MERGE_PREV);
1999 		vm_map_merged_neighbor_dispose(map, next);
2000 	}
2001 }
2002 
/*
 *	vm_map_clip_start:	[ internal use only ]
 *
 *	Ensures that the given entry begins at or after the specified
 *	address: when the entry starts below "startaddr", it is split in
 *	two by _vm_map_clip_start(); otherwise nothing is done.
 *
 *	Every macro argument is parenthesized in the expansion so that
 *	arbitrary expressions (for example "&e" or "c ? a : b") are parsed
 *	as the caller intended.  "entry" and "startaddr" are each evaluated
 *	more than once, so they must be free of side effects.
 */
#define vm_map_clip_start(map, entry, startaddr) \
{ \
	if ((startaddr) > (entry)->start) \
		_vm_map_clip_start((map), (entry), (startaddr)); \
}
2015 
2016 /*
2017  *	This routine is called only when it is known that
2018  *	the entry must be split.
 *
 *	On return "entry" begins at "start" and a newly created entry,
 *	copied from it, covers the part that used to lie below "start".
 *	The map must be locked, and "start" must lie strictly inside the
 *	entry; both conditions are asserted.
2019  */
2020 static void
2021 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
2022 {
2023 	vm_map_entry_t new_entry;
2024 
2025 	VM_MAP_ASSERT_LOCKED(map);
2026 	KASSERT(entry->end > start && entry->start < start,
2027 	    ("_vm_map_clip_start: invalid clip of entry %p", entry));
2028 
2029 	/*
2030 	 * Split off the front portion -- note that we must insert the new
2031 	 * entry BEFORE this one, so that this entry has the specified
2032 	 * starting address.
	 *
	 * Before doing so, give vm_map_simplify_entry() a chance to merge
	 * this entry with its neighbors.
2033 	 */
2034 	vm_map_simplify_entry(map, entry);
2035 
2036 	/*
2037 	 * If there is no object backing this entry, we might as well create
2038 	 * one now.  If we defer it, an object can get created after the map
2039 	 * is clipped, and individual objects will be created for the split-up
2040 	 * map.  This is a bit of a hack, but is also about the best place to
2041 	 * put this improvement.
	 *
	 * In either branch below, a credential held by the entry is handed
	 * over to the object together with a charge equal to the entry's
	 * size, and the entry's own cred pointer is cleared.
2042 	 */
2043 	if (entry->object.vm_object == NULL && !map->system_map &&
2044 	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
2045 		vm_object_t object;
2046 		object = vm_object_allocate(OBJT_DEFAULT,
2047 				atop(entry->end - entry->start));
2048 		entry->object.vm_object = object;
2049 		entry->offset = 0;
2050 		if (entry->cred != NULL) {
2051 			object->cred = entry->cred;
2052 			object->charge = entry->end - entry->start;
2053 			entry->cred = NULL;
2054 		}
2055 	} else if (entry->object.vm_object != NULL &&
2056 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2057 		   entry->cred != NULL) {
2058 		VM_OBJECT_WLOCK(entry->object.vm_object);
2059 		KASSERT(entry->object.vm_object->cred == NULL,
2060 		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
2061 		entry->object.vm_object->cred = entry->cred;
2062 		entry->object.vm_object->charge = entry->end - entry->start;
2063 		VM_OBJECT_WUNLOCK(entry->object.vm_object);
2064 		entry->cred = NULL;
2065 	}
2066 
	/* The new entry starts out as an exact copy of the one being split. */
2067 	new_entry = vm_map_entry_create(map);
2068 	*new_entry = *entry;
2069 
	/*
	 * new_entry keeps the front part and ends at "start"; the original
	 * entry is moved up to "start" and its object offset advances by the
	 * same amount.
	 */
2070 	new_entry->end = start;
2071 	entry->offset += (start - entry->start);
2072 	entry->start = start;
	/* Both entries now point at the same credential: take a second hold. */
2073 	if (new_entry->cred != NULL)
2074 		crhold(entry->cred);
2075 
2076 	vm_map_entry_link(map, new_entry);
2077 
	/* Likewise, the copied object pointer needs a reference of its own. */
2078 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2079 		vm_object_reference(new_entry->object.vm_object);
2080 		/*
2081 		 * The object->un_pager.vnp.writemappings for the
2082 		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
2083 		 * kept as is here.  The virtual pages are
2084 		 * re-distributed among the clipped entries, so the sum is
2085 		 * left the same.
2086 		 */
2087 	}
2088 }
2089 
/*
 *	vm_map_clip_end:	[ internal use only ]
 *
 *	Ensures that the given entry ends at or before the specified
 *	address: when the entry extends beyond "endaddr", it is split in
 *	two by _vm_map_clip_end(); otherwise nothing is done.
 *
 *	"entry" is parenthesized before the member access so that any
 *	pointer expression (for example "&e" or "p + 1") may be passed.
 *	"entry" and "endaddr" are each evaluated more than once, so they
 *	must be free of side effects.
 */
#define vm_map_clip_end(map, entry, endaddr) \
{ \
	if ((endaddr) < (entry)->end) \
		_vm_map_clip_end((map), (entry), (endaddr)); \
}
2102 
2103 /*
2104  *	This routine is called only when it is known that
2105  *	the entry must be split.
 *
 *	On return "entry" ends at "end" and a newly created entry, copied
 *	from it, covers the part that used to lie at and above "end".  The
 *	map must be locked, and "end" must lie strictly inside the entry;
 *	both conditions are asserted.
2106  */
2107 static void
2108 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
2109 {
2110 	vm_map_entry_t new_entry;
2111 
2112 	VM_MAP_ASSERT_LOCKED(map);
2113 	KASSERT(entry->start < end && entry->end > end,
2114 	    ("_vm_map_clip_end: invalid clip of entry %p", entry));
2115 
2116 	/*
2117 	 * If there is no object backing this entry, we might as well create
2118 	 * one now.  If we defer it, an object can get created after the map
2119 	 * is clipped, and individual objects will be created for the split-up
2120 	 * map.  This is a bit of a hack, but is also about the best place to
2121 	 * put this improvement.
	 *
	 * In either branch below, a credential held by the entry is handed
	 * over to the object together with a charge equal to the entry's
	 * size, and the entry's own cred pointer is cleared.
2122 	 */
2123 	if (entry->object.vm_object == NULL && !map->system_map &&
2124 	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
2125 		vm_object_t object;
2126 		object = vm_object_allocate(OBJT_DEFAULT,
2127 				atop(entry->end - entry->start));
2128 		entry->object.vm_object = object;
2129 		entry->offset = 0;
2130 		if (entry->cred != NULL) {
2131 			object->cred = entry->cred;
2132 			object->charge = entry->end - entry->start;
2133 			entry->cred = NULL;
2134 		}
2135 	} else if (entry->object.vm_object != NULL &&
2136 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2137 		   entry->cred != NULL) {
2138 		VM_OBJECT_WLOCK(entry->object.vm_object);
2139 		KASSERT(entry->object.vm_object->cred == NULL,
2140 		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
2141 		entry->object.vm_object->cred = entry->cred;
2142 		entry->object.vm_object->charge = entry->end - entry->start;
2143 		VM_OBJECT_WUNLOCK(entry->object.vm_object);
2144 		entry->cred = NULL;
2145 	}
2146 
2147 	/*
2148 	 * Create a new entry and insert it AFTER the specified entry
2149 	 */
2150 	new_entry = vm_map_entry_create(map);
2151 	*new_entry = *entry;
2152 
	/*
	 * The original entry is cut back to "end"; new_entry takes over from
	 * there, with its object offset advanced by the length that the
	 * original entry keeps.
	 */
2153 	new_entry->start = entry->end = end;
2154 	new_entry->offset += (end - entry->start);
	/* Both entries now point at the same credential: take a second hold. */
2155 	if (new_entry->cred != NULL)
2156 		crhold(entry->cred);
2157 
2158 	vm_map_entry_link(map, new_entry);
2159 
	/* Likewise, the copied object pointer needs a reference of its own. */
2160 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2161 		vm_object_reference(new_entry->object.vm_object);
2162 	}
2163 }
2164 
2165 /*
2166  *	vm_map_submap:		[ kernel use only ]
2167  *
2168  *	Mark the given range as handled by a subordinate map.
2169  *
2170  *	This range must have been created with vm_map_find,
2171  *	and no other operations may have been performed on this
2172  *	range prior to calling vm_map_submap.
2173  *
2174  *	Only a limited number of operations can be performed
2175  *	within this rage after calling vm_map_submap:
2176  *		vm_fault
2177  *	[Don't try vm_map_copy!]
2178  *
2179  *	To remove a submapping, one must first remove the
2180  *	range from the superior map, and then destroy the
2181  *	submap (if desired).  [Better yet, don't try it.]
2182  */
2183 int
2184 vm_map_submap(
2185 	vm_map_t map,
2186 	vm_offset_t start,
2187 	vm_offset_t end,
2188 	vm_map_t submap)
2189 {
2190 	vm_map_entry_t entry;
2191 	int result;
2192 
2193 	result = KERN_INVALID_ARGUMENT;
2194 
2195 	vm_map_lock(submap);
2196 	submap->flags |= MAP_IS_SUB_MAP;
2197 	vm_map_unlock(submap);
2198 
2199 	vm_map_lock(map);
2200 
2201 	VM_MAP_RANGE_CHECK(map, start, end);
2202 
2203 	if (vm_map_lookup_entry(map, start, &entry)) {
2204 		vm_map_clip_start(map, entry, start);
2205 	} else
2206 		entry = entry->next;
2207 
2208 	vm_map_clip_end(map, entry, end);
2209 
2210 	if ((entry->start == start) && (entry->end == end) &&
2211 	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2212 	    (entry->object.vm_object == NULL)) {
2213 		entry->object.sub_map = submap;
2214 		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2215 		result = KERN_SUCCESS;
2216 	}
2217 	vm_map_unlock(map);
2218 
2219 	if (result != KERN_SUCCESS) {
2220 		vm_map_lock(submap);
2221 		submap->flags &= ~MAP_IS_SUB_MAP;
2222 		vm_map_unlock(submap);
2223 	}
2224 	return (result);
2225 }
2226 
2227 /*
2228  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
 *
 * vm_map_pmap_enter() starts with this value as its prefault threshold and
 * raises the threshold whenever it skips ahead over a superpage.
2229  */
2230 #define	MAX_INIT_PT	96
2231 
2232 /*
2233  *	vm_map_pmap_enter:
2234  *
2235  *	Preload the specified map's pmap with mappings to the specified
2236  *	object's memory-resident pages.  No further physical pages are
2237  *	allocated, and no further virtual pages are retrieved from secondary
2238  *	storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
2239  *	limited number of page mappings are created at the low-end of the
2240  *	specified address range.  (For this purpose, a superpage mapping
2241  *	counts as one page mapping.)  Otherwise, all resident pages within
2242  *	the specified address range are mapped.
 *
 *	Nothing is done when "prot" grants neither read nor execute access
 *	or when "object" is NULL.  The object lock is acquired and released
 *	inside this routine.
2243  */
2244 static void
2245 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2246     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2247 {
2248 	vm_offset_t start;
2249 	vm_page_t p, p_start;
2250 	vm_pindex_t mask, psize, threshold, tmpidx;
2251 
2252 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2253 		return;
2254 	VM_OBJECT_RLOCK(object);
	/*
	 * Device and SG objects are handed to pmap_object_init_pt() under
	 * the write lock.  The read lock is dropped to obtain it, so the
	 * type is checked again afterwards; if it no longer matches, the
	 * lock is downgraded and the generic path below is taken.
	 */
2255 	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2256 		VM_OBJECT_RUNLOCK(object);
2257 		VM_OBJECT_WLOCK(object);
2258 		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2259 			pmap_object_init_pt(map->pmap, addr, object, pindex,
2260 			    size);
2261 			VM_OBJECT_WUNLOCK(object);
2262 			return;
2263 		}
2264 		VM_OBJECT_LOCK_DOWNGRADE(object);
2265 	}
2266 
	/*
	 * Clamp the page count so that the range does not extend past the
	 * end of the object; give up if it starts beyond the end.
	 */
2267 	psize = atop(size);
2268 	if (psize + pindex > object->size) {
2269 		if (object->size < pindex) {
2270 			VM_OBJECT_RUNLOCK(object);
2271 			return;
2272 		}
2273 		psize = object->size - pindex;
2274 	}
2275 
	/*
	 * "p_start" and "start" describe the current run of fully valid
	 * pages that has not yet been passed to pmap_enter_object();
	 * p_start == NULL means that no run is open.
	 */
2276 	start = 0;
2277 	p_start = NULL;
2278 	threshold = MAX_INIT_PT;
2279 
2280 	p = vm_page_find_least(object, pindex);
2281 	/*
2282 	 * Assert: the variable p is either (1) the page with the
2283 	 * least pindex greater than or equal to the parameter pindex
2284 	 * or (2) NULL.
2285 	 */
2286 	for (;
2287 	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
2288 	     p = TAILQ_NEXT(p, listq)) {
2289 		/*
2290 		 * don't allow an madvise to blow away our really
2291 		 * free pages allocating pv entries.
		 *
		 * The same exit is taken once a MAP_PREFAULT_PARTIAL request
		 * reaches the threshold.  psize is cut back to this page so
		 * that the final pmap_enter_object() call stops here.
2292 		 */
2293 		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2294 		    vm_page_count_severe()) ||
2295 		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2296 		    tmpidx >= threshold)) {
2297 			psize = tmpidx;
2298 			break;
2299 		}
2300 		if (p->valid == VM_PAGE_BITS_ALL) {
			/* Open a new run at this page if none is in progress. */
2301 			if (p_start == NULL) {
2302 				start = addr + ptoa(tmpidx);
2303 				p_start = p;
2304 			}
2305 			/* Jump ahead if a superpage mapping is possible. */
2306 			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
2307 			    (pagesizes[p->psind] - 1)) == 0) {
2308 				mask = atop(pagesizes[p->psind]) - 1;
2309 				if (tmpidx + mask < psize &&
2310 				    vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
					/*
					 * Skip to the last page of the
					 * superpage and extend the threshold
					 * by the pages skipped.
					 */
2311 					p += mask;
2312 					threshold += mask;
2313 				}
2314 			}
2315 		} else if (p_start != NULL) {
			/*
			 * A page that is not fully valid ends the open run;
			 * enter what has been collected so far.
			 */
2316 			pmap_enter_object(map->pmap, start, addr +
2317 			    ptoa(tmpidx), p_start, prot);
2318 			p_start = NULL;
2319 		}
2320 	}
	/* Enter the run that was still open when the loop ended, if any. */
2321 	if (p_start != NULL)
2322 		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2323 		    p_start, prot);
2324 	VM_OBJECT_RUNLOCK(object);
2325 }
2326 
2327 /*
2328  *	vm_map_protect:
2329  *
2330  *	Sets the protection of the specified address
2331  *	region in the target map.  If "set_max" is
2332  *	specified, the maximum protection is to be set;
2333  *	otherwise, only the current protection is affected.
 *
 *	Returns KERN_SUCCESS on success or for an empty range,
 *	KERN_INVALID_ARGUMENT if the range contains a submap entry,
 *	KERN_PROTECTION_FAILURE if "new_prot" exceeds an entry's maximum
 *	protection, and KERN_RESOURCE_SHORTAGE if swap space cannot be
 *	reserved.  The map lock is taken and released internally.
2334  */
2335 int
2336 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2337 	       vm_prot_t new_prot, boolean_t set_max)
2338 {
2339 	vm_map_entry_t current, entry;
2340 	vm_object_t obj;
2341 	struct ucred *cred;
2342 	vm_prot_t old_prot;
2343 
2344 	if (start == end)
2345 		return (KERN_SUCCESS);
2346 
2347 	vm_map_lock(map);
2348 
2349 	/*
2350 	 * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
2351 	 * need to fault pages into the map and will drop the map lock while
2352 	 * doing so, and the VM object may end up in an inconsistent state if we
2353 	 * update the protection on the map entry in between faults.
2354 	 */
2355 	vm_map_wait_busy(map);
2356 
2357 	VM_MAP_RANGE_CHECK(map, start, end);
2358 
2359 	if (vm_map_lookup_entry(map, start, &entry)) {
2360 		vm_map_clip_start(map, entry, start);
2361 	} else {
2362 		entry = entry->next;
2363 	}
2364 
2365 	/*
2366 	 * Make a first pass to check for protection violations.
	 * Guard entries are skipped; nothing is modified in this pass.
2367 	 */
2368 	for (current = entry; current->start < end; current = current->next) {
2369 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
2370 			continue;
2371 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2372 			vm_map_unlock(map);
2373 			return (KERN_INVALID_ARGUMENT);
2374 		}
2375 		if ((new_prot & current->max_protection) != new_prot) {
2376 			vm_map_unlock(map);
2377 			return (KERN_PROTECTION_FAILURE);
2378 		}
2379 	}
2380 
2381 	/*
2382 	 * Do an accounting pass for private read-only mappings that
2383 	 * now will do cow due to allowed write (e.g. debugger sets
2384 	 * breakpoint on text segment)
	 *
	 * This pass also clips the last entry at "end".
	 * NOTE(review): the KERN_RESOURCE_SHORTAGE returns below do not
	 * release reservations made by earlier iterations of this loop --
	 * confirm that this is intended.
2385 	 */
2386 	for (current = entry; current->start < end; current = current->next) {
2387 
2388 		vm_map_clip_end(map, current, end);
2389 
		/*
		 * No charge is needed when only the maximum protection is
		 * being set, when write access is not being newly granted,
		 * when the entry is already charged, or for guard entries.
		 */
2390 		if (set_max ||
2391 		    ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
2392 		    ENTRY_CHARGED(current) ||
2393 		    (current->eflags & MAP_ENTRY_GUARD) != 0) {
2394 			continue;
2395 		}
2396 
2397 		cred = curthread->td_ucred;
2398 		obj = current->object.vm_object;
2399 
		/*
		 * Without an object, or with a pending copy, the charge is
		 * recorded on the entry itself and covers the entry's size.
		 */
2400 		if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
2401 			if (!swap_reserve(current->end - current->start)) {
2402 				vm_map_unlock(map);
2403 				return (KERN_RESOURCE_SHORTAGE);
2404 			}
2405 			crhold(cred);
2406 			current->cred = cred;
2407 			continue;
2408 		}
2409 
		/* Only OBJT_DEFAULT and OBJT_SWAP objects are charged. */
2410 		VM_OBJECT_WLOCK(obj);
2411 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2412 			VM_OBJECT_WUNLOCK(obj);
2413 			continue;
2414 		}
2415 
2416 		/*
2417 		 * Charge for the whole object allocation now, since
2418 		 * we cannot distinguish between non-charged and
2419 		 * charged clipped mapping of the same object later.
2420 		 */
2421 		KASSERT(obj->charge == 0,
2422 		    ("vm_map_protect: object %p overcharged (entry %p)",
2423 		    obj, current));
2424 		if (!swap_reserve(ptoa(obj->size))) {
2425 			VM_OBJECT_WUNLOCK(obj);
2426 			vm_map_unlock(map);
2427 			return (KERN_RESOURCE_SHORTAGE);
2428 		}
2429 
2430 		crhold(cred);
2431 		obj->cred = cred;
2432 		obj->charge = ptoa(obj->size);
2433 		VM_OBJECT_WUNLOCK(obj);
2434 	}
2435 
2436 	/*
2437 	 * Go back and fix up protections. [Note that clipping is not
2438 	 * necessary the second time.]
2439 	 */
2440 	for (current = entry; current->start < end; current = current->next) {
2441 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
2442 			continue;
2443 
2444 		old_prot = current->protection;
2445 
		/*
		 * With set_max the new value becomes the maximum protection
		 * and the current protection is restricted to it; otherwise
		 * the current protection is simply replaced.
		 */
2446 		if (set_max)
2447 			current->protection =
2448 			    (current->max_protection = new_prot) &
2449 			    old_prot;
2450 		else
2451 			current->protection = new_prot;
2452 
2453 		/*
2454 		 * For user wired map entries, the normal lazy evaluation of
2455 		 * write access upgrades through soft page faults is
2456 		 * undesirable.  Instead, immediately copy any pages that are
2457 		 * copy-on-write and enable write access in the physical map.
2458 		 */
2459 		if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2460 		    (current->protection & VM_PROT_WRITE) != 0 &&
2461 		    (old_prot & VM_PROT_WRITE) == 0)
2462 			vm_fault_copy_entry(map, map, current, current, NULL);
2463 
2464 		/*
2465 		 * When restricting access, update the physical map.  Worry
2466 		 * about copy-on-write here.
		 * For a MAP_ENTRY_COW entry, write permission is masked out
		 * of what is passed to pmap_protect().
2467 		 */
2468 		if ((old_prot & ~current->protection) != 0) {
2469 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2470 							VM_PROT_ALL)
2471 			pmap_protect(map->pmap, current->start,
2472 			    current->end,
2473 			    current->protection & MASK(current));
2474 #undef	MASK
2475 		}
		/* The entry may now match a neighbor again; try to merge. */
2476 		vm_map_simplify_entry(map, current);
2477 	}
2478 	vm_map_unlock(map);
2479 	return (KERN_SUCCESS);
2480 }
2481 
2482 /*
2483  *	vm_map_madvise:
2484  *
2485  *	This routine traverses a process's map handling the madvise
2486  *	system call.  Advisories are classified as either those affecting
2487  *	the vm_map_entry structure, or those affecting the underlying
2488  *	objects.
 *
 *	Returns EINVAL for an unrecognized "behav" and 0 otherwise; an
 *	empty range with a recognized "behav" returns 0 without taking the
 *	map lock.  Submap entries are skipped.
2489  */
2490 int
2491 vm_map_madvise(
2492 	vm_map_t map,
2493 	vm_offset_t start,
2494 	vm_offset_t end,
2495 	int behav)
2496 {
2497 	vm_map_entry_t current, entry;
2498 	bool modify_map;
2499 
2500 	/*
2501 	 * Some madvise calls directly modify the vm_map_entry, in which case
2502 	 * we need to use an exclusive lock on the map and we need to perform
2503 	 * various clipping operations.  Otherwise we only need a read-lock
2504 	 * on the map.
2505 	 */
2506 	switch(behav) {
2507 	case MADV_NORMAL:
2508 	case MADV_SEQUENTIAL:
2509 	case MADV_RANDOM:
2510 	case MADV_NOSYNC:
2511 	case MADV_AUTOSYNC:
2512 	case MADV_NOCORE:
2513 	case MADV_CORE:
2514 		if (start == end)
2515 			return (0);
2516 		modify_map = true;
2517 		vm_map_lock(map);
2518 		break;
2519 	case MADV_WILLNEED:
2520 	case MADV_DONTNEED:
2521 	case MADV_FREE:
2522 		if (start == end)
2523 			return (0);
2524 		modify_map = false;
2525 		vm_map_lock_read(map);
2526 		break;
2527 	default:
2528 		return (EINVAL);
2529 	}
2530 
2531 	/*
2532 	 * Locate starting entry and clip if necessary.
	 * Clipping is done only when the map is held exclusively.
2533 	 */
2534 	VM_MAP_RANGE_CHECK(map, start, end);
2535 
2536 	if (vm_map_lookup_entry(map, start, &entry)) {
2537 		if (modify_map)
2538 			vm_map_clip_start(map, entry, start);
2539 	} else {
2540 		entry = entry->next;
2541 	}
2542 
2543 	if (modify_map) {
2544 		/*
2545 		 * madvise behaviors that are implemented in the vm_map_entry.
2546 		 *
2547 		 * We clip the vm_map_entry so that behavioral changes are
2548 		 * limited to the specified address range.
2549 		 */
2550 		for (current = entry; current->start < end;
2551 		    current = current->next) {
2552 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2553 				continue;
2554 
2555 			vm_map_clip_end(map, current, end);
2556 
2557 			switch (behav) {
2558 			case MADV_NORMAL:
2559 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2560 				break;
2561 			case MADV_SEQUENTIAL:
2562 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2563 				break;
2564 			case MADV_RANDOM:
2565 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2566 				break;
2567 			case MADV_NOSYNC:
2568 				current->eflags |= MAP_ENTRY_NOSYNC;
2569 				break;
2570 			case MADV_AUTOSYNC:
2571 				current->eflags &= ~MAP_ENTRY_NOSYNC;
2572 				break;
2573 			case MADV_NOCORE:
2574 				current->eflags |= MAP_ENTRY_NOCOREDUMP;
2575 				break;
2576 			case MADV_CORE:
2577 				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2578 				break;
2579 			default:
2580 				break;
2581 			}
			/* The updated entry may now match a neighbor. */
2582 			vm_map_simplify_entry(map, current);
2583 		}
2584 		vm_map_unlock(map);
2585 	} else {
2586 		vm_pindex_t pstart, pend;
2587 
2588 		/*
2589 		 * madvise behaviors that are implemented in the underlying
2590 		 * vm_object.
2591 		 *
2592 		 * Since we don't clip the vm_map_entry, we have to clip
2593 		 * the vm_object pindex and count.
2594 		 */
2595 		for (current = entry; current->start < end;
2596 		    current = current->next) {
2597 			vm_offset_t useEnd, useStart;
2598 
2599 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2600 				continue;
2601 
			/*
			 * Start from the page-index and address ranges of the
			 * whole entry, then trim both to [start, end).
			 */
2602 			pstart = OFF_TO_IDX(current->offset);
2603 			pend = pstart + atop(current->end - current->start);
2604 			useStart = current->start;
2605 			useEnd = current->end;
2606 
2607 			if (current->start < start) {
2608 				pstart += atop(start - current->start);
2609 				useStart = start;
2610 			}
2611 			if (current->end > end) {
2612 				pend -= atop(current->end - end);
2613 				useEnd = end;
2614 			}
2615 
			/* Nothing is left of this entry after trimming. */
2616 			if (pstart >= pend)
2617 				continue;
2618 
2619 			/*
2620 			 * Perform the pmap_advise() before clearing
2621 			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
2622 			 * concurrent pmap operation, such as pmap_remove(),
2623 			 * could clear a reference in the pmap and set
2624 			 * PGA_REFERENCED on the page before the pmap_advise()
2625 			 * had completed.  Consequently, the page would appear
2626 			 * referenced based upon an old reference that
2627 			 * occurred before this pmap_advise() ran.
2628 			 */
2629 			if (behav == MADV_DONTNEED || behav == MADV_FREE)
2630 				pmap_advise(map->pmap, useStart, useEnd,
2631 				    behav);
2632 
2633 			vm_object_madvise(current->object.vm_object, pstart,
2634 			    pend, behav);
2635 
2636 			/*
2637 			 * Pre-populate paging structures in the
2638 			 * WILLNEED case.  For wired entries, the
2639 			 * paging structures are already populated.
2640 			 */
2641 			if (behav == MADV_WILLNEED &&
2642 			    current->wired_count == 0) {
2643 				vm_map_pmap_enter(map,
2644 				    useStart,
2645 				    current->protection,
2646 				    current->object.vm_object,
2647 				    pstart,
2648 				    ptoa(pend - pstart),
2649 				    MAP_PREFAULT_MADVISE
2650 				);
2651 			}
2652 		}
2653 		vm_map_unlock_read(map);
2654 	}
2655 	return (0);
2656 }
2657 
2658 
2659 /*
2660  *	vm_map_inherit:
2661  *
2662  *	Sets the inheritance of the specified address
2663  *	range in the target map.  Inheritance
2664  *	affects how the map will be shared with
2665  *	child maps at the time of vmspace_fork.
2666  */
2667 int
2668 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2669 	       vm_inherit_t new_inheritance)
2670 {
2671 	vm_map_entry_t entry;
2672 	vm_map_entry_t temp_entry;
2673 
2674 	switch (new_inheritance) {
2675 	case VM_INHERIT_NONE:
2676 	case VM_INHERIT_COPY:
2677 	case VM_INHERIT_SHARE:
2678 	case VM_INHERIT_ZERO:
2679 		break;
2680 	default:
2681 		return (KERN_INVALID_ARGUMENT);
2682 	}
2683 	if (start == end)
2684 		return (KERN_SUCCESS);
2685 	vm_map_lock(map);
2686 	VM_MAP_RANGE_CHECK(map, start, end);
2687 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
2688 		entry = temp_entry;
2689 		vm_map_clip_start(map, entry, start);
2690 	} else
2691 		entry = temp_entry->next;
2692 	while (entry->start < end) {
2693 		vm_map_clip_end(map, entry, end);
2694 		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
2695 		    new_inheritance != VM_INHERIT_ZERO)
2696 			entry->inheritance = new_inheritance;
2697 		vm_map_simplify_entry(map, entry);
2698 		entry = entry->next;
2699 	}
2700 	vm_map_unlock(map);
2701 	return (KERN_SUCCESS);
2702 }
2703 
2704 /*
2705  *	vm_map_unwire:
2706  *
2707  *	Implements both kernel and user unwiring.
 *
 *	VM_MAP_WIRE_USER in "flags" selects user unwiring, and
 *	VM_MAP_WIRE_HOLESOK tolerates unmapped holes in [start, end).
 *
 *	The work is done in two phases.  The first loop clips each entry
 *	in the range and marks it MAP_ENTRY_IN_TRANSITION on behalf of the
 *	current thread, sleeping whenever an entry is already in
 *	transition.  The loop after "done" clears those marks again and,
 *	only if the first phase succeeded, drops the wiring.
 *
 *	Returns KERN_SUCCESS (also for an empty range),
 *	KERN_INVALID_ADDRESS when a hole is found and holes are not
 *	permitted, or KERN_INVALID_ARGUMENT when a system unwire meets an
 *	entry that is not system wired.
2708  */
2709 int
2710 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2711     int flags)
2712 {
2713 	vm_map_entry_t entry, first_entry, tmp_entry;
2714 	vm_offset_t saved_start;
2715 	unsigned int last_timestamp;
2716 	int rv;
2717 	boolean_t need_wakeup, result, user_unwire;
2718 
2719 	if (start == end)
2720 		return (KERN_SUCCESS);
2721 	user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2722 	vm_map_lock(map);
2723 	VM_MAP_RANGE_CHECK(map, start, end);
	/* A range that starts in a hole is an error unless holes are OK. */
2724 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
2725 		if (flags & VM_MAP_WIRE_HOLESOK)
2726 			first_entry = first_entry->next;
2727 		else {
2728 			vm_map_unlock(map);
2729 			return (KERN_INVALID_ADDRESS);
2730 		}
2731 	}
2732 	last_timestamp = map->timestamp;
2733 	entry = first_entry;
2734 	while (entry->start < end) {
2735 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2736 			/*
2737 			 * We have not yet clipped the entry.
			 * Remember where to resume (the later of "start" and
			 * the entry's start), ask to be woken up, and sleep
			 * with the map unlocked.
2738 			 */
2739 			saved_start = (start >= entry->start) ? start :
2740 			    entry->start;
2741 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2742 			if (vm_map_unlock_and_wait(map, 0)) {
2743 				/*
2744 				 * Allow interruption of user unwiring?
2745 				 */
2746 			}
2747 			vm_map_lock(map);
2748 			if (last_timestamp+1 != map->timestamp) {
2749 				/*
2750 				 * Look again for the entry because the map was
2751 				 * modified while it was unlocked.
2752 				 * Specifically, the entry may have been
2753 				 * clipped, merged, or deleted.
2754 				 */
2755 				if (!vm_map_lookup_entry(map, saved_start,
2756 				    &tmp_entry)) {
2757 					if (flags & VM_MAP_WIRE_HOLESOK)
2758 						tmp_entry = tmp_entry->next;
2759 					else {
2760 						if (saved_start == start) {
2761 							/*
2762 							 * First_entry has been deleted.
2763 							 */
2764 							vm_map_unlock(map);
2765 							return (KERN_INVALID_ADDRESS);
2766 						}
						/*
						 * Limit the cleanup pass to
						 * the part processed so far.
						 */
2767 						end = saved_start;
2768 						rv = KERN_INVALID_ADDRESS;
2769 						goto done;
2770 					}
2771 				}
				/*
				 * first_entry can be carried over only while
				 * we are still on the first entry; otherwise
				 * it is looked up again after "done".
				 */
2772 				if (entry == first_entry)
2773 					first_entry = tmp_entry;
2774 				else
2775 					first_entry = NULL;
2776 				entry = tmp_entry;
2777 			}
2778 			last_timestamp = map->timestamp;
2779 			continue;
2780 		}
2781 		vm_map_clip_start(map, entry, start);
2782 		vm_map_clip_end(map, entry, end);
2783 		/*
2784 		 * Mark the entry in case the map lock is released.  (See
2785 		 * above.)
2786 		 */
2787 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2788 		    entry->wiring_thread == NULL,
2789 		    ("owned map entry %p", entry));
2790 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2791 		entry->wiring_thread = curthread;
2792 		/*
2793 		 * Check the map for holes in the specified region.
2794 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2795 		 */
2796 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2797 		    (entry->end < end && entry->next->start > entry->end)) {
2798 			end = entry->end;
2799 			rv = KERN_INVALID_ADDRESS;
2800 			goto done;
2801 		}
2802 		/*
2803 		 * If system unwiring, require that the entry is system wired.
2804 		 */
2805 		if (!user_unwire &&
2806 		    vm_map_entry_system_wired_count(entry) == 0) {
2807 			end = entry->end;
2808 			rv = KERN_INVALID_ARGUMENT;
2809 			goto done;
2810 		}
2811 		entry = entry->next;
2812 	}
2813 	rv = KERN_SUCCESS;
2814 done:
	/*
	 * Second phase: walk the entries marked above, undo the
	 * in-transition marks, and unwire only when rv is KERN_SUCCESS.
	 */
2815 	need_wakeup = FALSE;
2816 	if (first_entry == NULL) {
2817 		result = vm_map_lookup_entry(map, start, &first_entry);
2818 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2819 			first_entry = first_entry->next;
2820 		else
2821 			KASSERT(result, ("vm_map_unwire: lookup failed"));
2822 	}
2823 	for (entry = first_entry; entry->start < end; entry = entry->next) {
2824 		/*
2825 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
2826 		 * space in the unwired region could have been mapped
2827 		 * while the map lock was dropped for draining
2828 		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
2829 		 * could be simultaneously wiring this new mapping
2830 		 * entry.  Detect these cases and skip any entries
2831 		 * marked as in transition by us.
2832 		 */
2833 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2834 		    entry->wiring_thread != curthread) {
2835 			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2836 			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
2837 			continue;
2838 		}
2839 
		/*
		 * A user unwire touches only entries flagged
		 * MAP_ENTRY_USER_WIRED.  Dropping the last wiring goes
		 * through vm_map_entry_unwire(); otherwise the count is
		 * simply decremented.
		 */
2840 		if (rv == KERN_SUCCESS && (!user_unwire ||
2841 		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
2842 			if (user_unwire)
2843 				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2844 			if (entry->wired_count == 1)
2845 				vm_map_entry_unwire(map, entry);
2846 			else
2847 				entry->wired_count--;
2848 		}
2849 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2850 		    ("vm_map_unwire: in-transition flag missing %p", entry));
2851 		KASSERT(entry->wiring_thread == curthread,
2852 		    ("vm_map_unwire: alien wire %p", entry));
2853 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2854 		entry->wiring_thread = NULL;
		/* Note any waiter; the wakeup is issued after unlocking. */
2855 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2856 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2857 			need_wakeup = TRUE;
2858 		}
2859 		vm_map_simplify_entry(map, entry);
2860 	}
2861 	vm_map_unlock(map);
2862 	if (need_wakeup)
2863 		vm_map_wakeup(map);
2864 	return (rv);
2865 }
2866 
2867 /*
2868  *	vm_map_wire_entry_failure:
2869  *
2870  *	Handle a wiring failure on the given entry.
2871  *
2872  *	The map should be locked.
2873  */
2874 static void
2875 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
2876     vm_offset_t failed_addr)
2877 {
2878 
2879 	VM_MAP_ASSERT_LOCKED(map);
2880 	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
2881 	    entry->wired_count == 1,
2882 	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
2883 	KASSERT(failed_addr < entry->end,
2884 	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
2885 
2886 	/*
2887 	 * If any pages at the start of this entry were successfully wired,
2888 	 * then unwire them.
2889 	 */
2890 	if (failed_addr > entry->start) {
2891 		pmap_unwire(map->pmap, entry->start, failed_addr);
2892 		vm_object_unwire(entry->object.vm_object, entry->offset,
2893 		    failed_addr - entry->start, PQ_ACTIVE);
2894 	}
2895 
2896 	/*
2897 	 * Assign an out-of-range value to represent the failure to wire this
2898 	 * entry.
2899 	 */
2900 	entry->wired_count = -1;
2901 }
2902 
2903 /*
2904  *	vm_map_wire:
2905  *
2906  *	Implements both kernel and user wiring.
2907  */
2908 int
2909 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2910     int flags)
2911 {
2912 	vm_map_entry_t entry, first_entry, tmp_entry;
2913 	vm_offset_t faddr, saved_end, saved_start;
2914 	unsigned int last_timestamp;
2915 	int rv;
2916 	boolean_t need_wakeup, result, user_wire;
2917 	vm_prot_t prot;
2918 
2919 	if (start == end)
2920 		return (KERN_SUCCESS);
2921 	prot = 0;
2922 	if (flags & VM_MAP_WIRE_WRITE)
2923 		prot |= VM_PROT_WRITE;
2924 	user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2925 	vm_map_lock(map);
2926 	VM_MAP_RANGE_CHECK(map, start, end);
2927 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
2928 		if (flags & VM_MAP_WIRE_HOLESOK)
2929 			first_entry = first_entry->next;
2930 		else {
2931 			vm_map_unlock(map);
2932 			return (KERN_INVALID_ADDRESS);
2933 		}
2934 	}
2935 	last_timestamp = map->timestamp;
2936 	entry = first_entry;
2937 	while (entry->start < end) {
2938 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2939 			/*
2940 			 * We have not yet clipped the entry.
2941 			 */
2942 			saved_start = (start >= entry->start) ? start :
2943 			    entry->start;
2944 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2945 			if (vm_map_unlock_and_wait(map, 0)) {
2946 				/*
2947 				 * Allow interruption of user wiring?
2948 				 */
2949 			}
2950 			vm_map_lock(map);
2951 			if (last_timestamp + 1 != map->timestamp) {
2952 				/*
2953 				 * Look again for the entry because the map was
2954 				 * modified while it was unlocked.
2955 				 * Specifically, the entry may have been
2956 				 * clipped, merged, or deleted.
2957 				 */
2958 				if (!vm_map_lookup_entry(map, saved_start,
2959 				    &tmp_entry)) {
2960 					if (flags & VM_MAP_WIRE_HOLESOK)
2961 						tmp_entry = tmp_entry->next;
2962 					else {
2963 						if (saved_start == start) {
2964 							/*
2965 							 * first_entry has been deleted.
2966 							 */
2967 							vm_map_unlock(map);
2968 							return (KERN_INVALID_ADDRESS);
2969 						}
2970 						end = saved_start;
2971 						rv = KERN_INVALID_ADDRESS;
2972 						goto done;
2973 					}
2974 				}
2975 				if (entry == first_entry)
2976 					first_entry = tmp_entry;
2977 				else
2978 					first_entry = NULL;
2979 				entry = tmp_entry;
2980 			}
2981 			last_timestamp = map->timestamp;
2982 			continue;
2983 		}
2984 		vm_map_clip_start(map, entry, start);
2985 		vm_map_clip_end(map, entry, end);
2986 		/*
2987 		 * Mark the entry in case the map lock is released.  (See
2988 		 * above.)
2989 		 */
2990 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2991 		    entry->wiring_thread == NULL,
2992 		    ("owned map entry %p", entry));
2993 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2994 		entry->wiring_thread = curthread;
2995 		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
2996 		    || (entry->protection & prot) != prot) {
2997 			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
2998 			if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
2999 				end = entry->end;
3000 				rv = KERN_INVALID_ADDRESS;
3001 				goto done;
3002 			}
3003 			goto next_entry;
3004 		}
3005 		if (entry->wired_count == 0) {
3006 			entry->wired_count++;
3007 			saved_start = entry->start;
3008 			saved_end = entry->end;
3009 
3010 			/*
3011 			 * Release the map lock, relying on the in-transition
3012 			 * mark.  Mark the map busy for fork.
3013 			 */
3014 			vm_map_busy(map);
3015 			vm_map_unlock(map);
3016 
3017 			faddr = saved_start;
3018 			do {
3019 				/*
3020 				 * Simulate a fault to get the page and enter
3021 				 * it into the physical map.
3022 				 */
3023 				if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
3024 				    VM_FAULT_WIRE)) != KERN_SUCCESS)
3025 					break;
3026 			} while ((faddr += PAGE_SIZE) < saved_end);
3027 			vm_map_lock(map);
3028 			vm_map_unbusy(map);
3029 			if (last_timestamp + 1 != map->timestamp) {
3030 				/*
3031 				 * Look again for the entry because the map was
3032 				 * modified while it was unlocked.  The entry
3033 				 * may have been clipped, but NOT merged or
3034 				 * deleted.
3035 				 */
3036 				result = vm_map_lookup_entry(map, saved_start,
3037 				    &tmp_entry);
3038 				KASSERT(result, ("vm_map_wire: lookup failed"));
3039 				if (entry == first_entry)
3040 					first_entry = tmp_entry;
3041 				else
3042 					first_entry = NULL;
3043 				entry = tmp_entry;
3044 				while (entry->end < saved_end) {
3045 					/*
3046 					 * In case of failure, handle entries
3047 					 * that were not fully wired here;
3048 					 * fully wired entries are handled
3049 					 * later.
3050 					 */
3051 					if (rv != KERN_SUCCESS &&
3052 					    faddr < entry->end)
3053 						vm_map_wire_entry_failure(map,
3054 						    entry, faddr);
3055 					entry = entry->next;
3056 				}
3057 			}
3058 			last_timestamp = map->timestamp;
3059 			if (rv != KERN_SUCCESS) {
3060 				vm_map_wire_entry_failure(map, entry, faddr);
3061 				end = entry->end;
3062 				goto done;
3063 			}
3064 		} else if (!user_wire ||
3065 			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3066 			entry->wired_count++;
3067 		}
3068 		/*
3069 		 * Check the map for holes in the specified region.
3070 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
3071 		 */
3072 	next_entry:
3073 		if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
3074 		    entry->end < end && entry->next->start > entry->end) {
3075 			end = entry->end;
3076 			rv = KERN_INVALID_ADDRESS;
3077 			goto done;
3078 		}
3079 		entry = entry->next;
3080 	}
3081 	rv = KERN_SUCCESS;
3082 done:
3083 	need_wakeup = FALSE;
3084 	if (first_entry == NULL) {
3085 		result = vm_map_lookup_entry(map, start, &first_entry);
3086 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
3087 			first_entry = first_entry->next;
3088 		else
3089 			KASSERT(result, ("vm_map_wire: lookup failed"));
3090 	}
3091 	for (entry = first_entry; entry->start < end; entry = entry->next) {
3092 		/*
3093 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
3094 		 * space in the unwired region could have been mapped
3095 		 * while the map lock was dropped for faulting in the
3096 		 * pages or draining MAP_ENTRY_IN_TRANSITION.
3097 		 * Moreover, another thread could be simultaneously
3098 		 * wiring this new mapping entry.  Detect these cases
3099 		 * and skip any entries marked as in transition not by us.
3100 		 */
3101 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3102 		    entry->wiring_thread != curthread) {
3103 			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
3104 			    ("vm_map_wire: !HOLESOK and new/changed entry"));
3105 			continue;
3106 		}
3107 
3108 		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
3109 			goto next_entry_done;
3110 
3111 		if (rv == KERN_SUCCESS) {
3112 			if (user_wire)
3113 				entry->eflags |= MAP_ENTRY_USER_WIRED;
3114 		} else if (entry->wired_count == -1) {
3115 			/*
3116 			 * Wiring failed on this entry.  Thus, unwiring is
3117 			 * unnecessary.
3118 			 */
3119 			entry->wired_count = 0;
3120 		} else if (!user_wire ||
3121 		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3122 			/*
3123 			 * Undo the wiring.  Wiring succeeded on this entry
3124 			 * but failed on a later entry.
3125 			 */
3126 			if (entry->wired_count == 1)
3127 				vm_map_entry_unwire(map, entry);
3128 			else
3129 				entry->wired_count--;
3130 		}
3131 	next_entry_done:
3132 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3133 		    ("vm_map_wire: in-transition flag missing %p", entry));
3134 		KASSERT(entry->wiring_thread == curthread,
3135 		    ("vm_map_wire: alien wire %p", entry));
3136 		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3137 		    MAP_ENTRY_WIRE_SKIPPED);
3138 		entry->wiring_thread = NULL;
3139 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3140 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3141 			need_wakeup = TRUE;
3142 		}
3143 		vm_map_simplify_entry(map, entry);
3144 	}
3145 	vm_map_unlock(map);
3146 	if (need_wakeup)
3147 		vm_map_wakeup(map);
3148 	return (rv);
3149 }
3150 
3151 /*
3152  * vm_map_sync
3153  *
3154  * Push any dirty cached pages in the address range to their pager.
3155  * If syncio is TRUE, dirty pages are written synchronously.
3156  * If invalidate is TRUE, any cached pages are freed as well.
3157  *
3158  * If the size of the region from start to end is zero, we are
3159  * supposed to flush all modified pages within the region containing
3160  * start.  Unfortunately, a region can be split or coalesced with
3161  * neighboring regions, making it difficult to determine what the
3162  * original region was.  Therefore, we approximate this requirement by
3163  * flushing the current region containing start.
3164  *
3165  * Returns an error if any part of the specified range is not mapped.
3166  */
3167 int
3168 vm_map_sync(
3169 	vm_map_t map,
3170 	vm_offset_t start,
3171 	vm_offset_t end,
3172 	boolean_t syncio,
3173 	boolean_t invalidate)
3174 {
3175 	vm_map_entry_t current;
3176 	vm_map_entry_t entry;
3177 	vm_size_t size;
3178 	vm_object_t object;
3179 	vm_ooffset_t offset;
3180 	unsigned int last_timestamp;
3181 	boolean_t failed;
3182 
3183 	vm_map_lock_read(map);
3184 	VM_MAP_RANGE_CHECK(map, start, end);
3185 	if (!vm_map_lookup_entry(map, start, &entry)) {
3186 		vm_map_unlock_read(map);
3187 		return (KERN_INVALID_ADDRESS);
3188 	} else if (start == end) {
3189 		start = entry->start;
3190 		end = entry->end;
3191 	}
3192 	/*
3193 	 * Make a first pass to check for user-wired memory and holes.
3194 	 */
3195 	for (current = entry; current->start < end; current = current->next) {
3196 		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
3197 			vm_map_unlock_read(map);
3198 			return (KERN_INVALID_ARGUMENT);
3199 		}
3200 		if (end > current->end &&
3201 		    current->end != current->next->start) {
3202 			vm_map_unlock_read(map);
3203 			return (KERN_INVALID_ADDRESS);
3204 		}
3205 	}
3206 
3207 	if (invalidate)
3208 		pmap_remove(map->pmap, start, end);
3209 	failed = FALSE;
3210 
3211 	/*
3212 	 * Make a second pass, cleaning/uncaching pages from the indicated
3213 	 * objects as we go.
3214 	 */
3215 	for (current = entry; current->start < end;) {
3216 		offset = current->offset + (start - current->start);
3217 		size = (end <= current->end ? end : current->end) - start;
3218 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
3219 			vm_map_t smap;
3220 			vm_map_entry_t tentry;
3221 			vm_size_t tsize;
3222 
3223 			smap = current->object.sub_map;
3224 			vm_map_lock_read(smap);
3225 			(void) vm_map_lookup_entry(smap, offset, &tentry);
3226 			tsize = tentry->end - offset;
3227 			if (tsize < size)
3228 				size = tsize;
3229 			object = tentry->object.vm_object;
3230 			offset = tentry->offset + (offset - tentry->start);
3231 			vm_map_unlock_read(smap);
3232 		} else {
3233 			object = current->object.vm_object;
3234 		}
3235 		vm_object_reference(object);
3236 		last_timestamp = map->timestamp;
3237 		vm_map_unlock_read(map);
3238 		if (!vm_object_sync(object, offset, size, syncio, invalidate))
3239 			failed = TRUE;
3240 		start += size;
3241 		vm_object_deallocate(object);
3242 		vm_map_lock_read(map);
3243 		if (last_timestamp == map->timestamp ||
3244 		    !vm_map_lookup_entry(map, start, &current))
3245 			current = current->next;
3246 	}
3247 
3248 	vm_map_unlock_read(map);
3249 	return (failed ? KERN_FAILURE : KERN_SUCCESS);
3250 }
3251 
3252 /*
3253  *	vm_map_entry_unwire:	[ internal use only ]
3254  *
3255  *	Make the region specified by this entry pageable.
3256  *
3257  *	The map in question should be locked.
3258  *	[This is the reason for this routine's existence.]
3259  */
3260 static void
3261 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3262 {
3263 
3264 	VM_MAP_ASSERT_LOCKED(map);
3265 	KASSERT(entry->wired_count > 0,
3266 	    ("vm_map_entry_unwire: entry %p isn't wired", entry));
3267 	pmap_unwire(map->pmap, entry->start, entry->end);
3268 	vm_object_unwire(entry->object.vm_object, entry->offset, entry->end -
3269 	    entry->start, PQ_ACTIVE);
3270 	entry->wired_count = 0;
3271 }
3272 
3273 static void
3274 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3275 {
3276 
3277 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3278 		vm_object_deallocate(entry->object.vm_object);
3279 	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3280 }
3281 
3282 /*
3283  *	vm_map_entry_delete:	[ internal use only ]
3284  *
3285  *	Deallocate the given entry from the target map.
3286  */
3287 static void
3288 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3289 {
3290 	vm_object_t object;
3291 	vm_pindex_t offidxstart, offidxend, count, size1;
3292 	vm_size_t size;
3293 
3294 	vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3295 	object = entry->object.vm_object;
3296 
3297 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3298 		MPASS(entry->cred == NULL);
3299 		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3300 		MPASS(object == NULL);
3301 		vm_map_entry_deallocate(entry, map->system_map);
3302 		return;
3303 	}
3304 
3305 	size = entry->end - entry->start;
3306 	map->size -= size;
3307 
3308 	if (entry->cred != NULL) {
3309 		swap_release_by_cred(size, entry->cred);
3310 		crfree(entry->cred);
3311 	}
3312 
3313 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
3314 	    (object != NULL)) {
3315 		KASSERT(entry->cred == NULL || object->cred == NULL ||
3316 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
3317 		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3318 		count = atop(size);
3319 		offidxstart = OFF_TO_IDX(entry->offset);
3320 		offidxend = offidxstart + count;
3321 		VM_OBJECT_WLOCK(object);
3322 		if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
3323 		    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
3324 		    object == kernel_object)) {
3325 			vm_object_collapse(object);
3326 
3327 			/*
3328 			 * The option OBJPR_NOTMAPPED can be passed here
3329 			 * because vm_map_delete() already performed
3330 			 * pmap_remove() on the only mapping to this range
3331 			 * of pages.
3332 			 */
3333 			vm_object_page_remove(object, offidxstart, offidxend,
3334 			    OBJPR_NOTMAPPED);
3335 			if (object->type == OBJT_SWAP)
3336 				swap_pager_freespace(object, offidxstart,
3337 				    count);
3338 			if (offidxend >= object->size &&
3339 			    offidxstart < object->size) {
3340 				size1 = object->size;
3341 				object->size = offidxstart;
3342 				if (object->cred != NULL) {
3343 					size1 -= object->size;
3344 					KASSERT(object->charge >= ptoa(size1),
3345 					    ("object %p charge < 0", object));
3346 					swap_release_by_cred(ptoa(size1),
3347 					    object->cred);
3348 					object->charge -= ptoa(size1);
3349 				}
3350 			}
3351 		}
3352 		VM_OBJECT_WUNLOCK(object);
3353 	} else
3354 		entry->object.vm_object = NULL;
3355 	if (map->system_map)
3356 		vm_map_entry_deallocate(entry, TRUE);
3357 	else {
3358 		entry->next = curthread->td_map_def_user;
3359 		curthread->td_map_def_user = entry;
3360 	}
3361 }
3362 
3363 /*
3364  *	vm_map_delete:	[ internal use only ]
3365  *
3366  *	Deallocates the given address range from the target
3367  *	map.
3368  */
3369 int
3370 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
3371 {
3372 	vm_map_entry_t entry;
3373 	vm_map_entry_t first_entry;
3374 
3375 	VM_MAP_ASSERT_LOCKED(map);
3376 	if (start == end)
3377 		return (KERN_SUCCESS);
3378 
3379 	/*
3380 	 * Find the start of the region, and clip it
3381 	 */
3382 	if (!vm_map_lookup_entry(map, start, &first_entry))
3383 		entry = first_entry->next;
3384 	else {
3385 		entry = first_entry;
3386 		vm_map_clip_start(map, entry, start);
3387 	}
3388 
3389 	/*
3390 	 * Step through all entries in this region
3391 	 */
3392 	while (entry->start < end) {
3393 		vm_map_entry_t next;
3394 
3395 		/*
3396 		 * Wait for wiring or unwiring of an entry to complete.
3397 		 * Also wait for any system wirings to disappear on
3398 		 * user maps.
3399 		 */
3400 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3401 		    (vm_map_pmap(map) != kernel_pmap &&
3402 		    vm_map_entry_system_wired_count(entry) != 0)) {
3403 			unsigned int last_timestamp;
3404 			vm_offset_t saved_start;
3405 			vm_map_entry_t tmp_entry;
3406 
3407 			saved_start = entry->start;
3408 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3409 			last_timestamp = map->timestamp;
3410 			(void) vm_map_unlock_and_wait(map, 0);
3411 			vm_map_lock(map);
3412 			if (last_timestamp + 1 != map->timestamp) {
3413 				/*
3414 				 * Look again for the entry because the map was
3415 				 * modified while it was unlocked.
3416 				 * Specifically, the entry may have been
3417 				 * clipped, merged, or deleted.
3418 				 */
3419 				if (!vm_map_lookup_entry(map, saved_start,
3420 							 &tmp_entry))
3421 					entry = tmp_entry->next;
3422 				else {
3423 					entry = tmp_entry;
3424 					vm_map_clip_start(map, entry,
3425 							  saved_start);
3426 				}
3427 			}
3428 			continue;
3429 		}
3430 		vm_map_clip_end(map, entry, end);
3431 
3432 		next = entry->next;
3433 
3434 		/*
3435 		 * Unwire before removing addresses from the pmap; otherwise,
3436 		 * unwiring will put the entries back in the pmap.
3437 		 */
3438 		if (entry->wired_count != 0)
3439 			vm_map_entry_unwire(map, entry);
3440 
3441 		/*
3442 		 * Remove mappings for the pages, but only if the
3443 		 * mappings could exist.  For instance, it does not
3444 		 * make sense to call pmap_remove() for guard entries.
3445 		 */
3446 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
3447 		    entry->object.vm_object != NULL)
3448 			pmap_remove(map->pmap, entry->start, entry->end);
3449 
3450 		if (entry->end == map->anon_loc)
3451 			map->anon_loc = entry->start;
3452 
3453 		/*
3454 		 * Delete the entry only after removing all pmap
3455 		 * entries pointing to its pages.  (Otherwise, its
3456 		 * page frames may be reallocated, and any modify bits
3457 		 * will be set in the wrong object!)
3458 		 */
3459 		vm_map_entry_delete(map, entry);
3460 		entry = next;
3461 	}
3462 	return (KERN_SUCCESS);
3463 }
3464 
3465 /*
3466  *	vm_map_remove:
3467  *
3468  *	Remove the given address range from the target map.
3469  *	This is the exported form of vm_map_delete.
3470  */
3471 int
3472 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3473 {
3474 	int result;
3475 
3476 	vm_map_lock(map);
3477 	VM_MAP_RANGE_CHECK(map, start, end);
3478 	result = vm_map_delete(map, start, end);
3479 	vm_map_unlock(map);
3480 	return (result);
3481 }
3482 
3483 /*
3484  *	vm_map_check_protection:
3485  *
3486  *	Assert that the target map allows the specified privilege on the
3487  *	entire address region given.  The entire region must be allocated.
3488  *
3489  *	WARNING!  This code does not and should not check whether the
3490  *	contents of the region is accessible.  For example a smaller file
3491  *	might be mapped into a larger address space.
3492  *
3493  *	NOTE!  This code is also called by munmap().
3494  *
3495  *	The map must be locked.  A read lock is sufficient.
3496  */
3497 boolean_t
3498 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3499 			vm_prot_t protection)
3500 {
3501 	vm_map_entry_t entry;
3502 	vm_map_entry_t tmp_entry;
3503 
3504 	if (!vm_map_lookup_entry(map, start, &tmp_entry))
3505 		return (FALSE);
3506 	entry = tmp_entry;
3507 
3508 	while (start < end) {
3509 		/*
3510 		 * No holes allowed!
3511 		 */
3512 		if (start < entry->start)
3513 			return (FALSE);
3514 		/*
3515 		 * Check protection associated with entry.
3516 		 */
3517 		if ((entry->protection & protection) != protection)
3518 			return (FALSE);
3519 		/* go to next entry */
3520 		start = entry->end;
3521 		entry = entry->next;
3522 	}
3523 	return (TRUE);
3524 }
3525 
3526 /*
3527  *	vm_map_copy_entry:
3528  *
3529  *	Copies the contents of the source entry to the destination
3530  *	entry.  The entries *must* be aligned properly.
3531  */
3532 static void
3533 vm_map_copy_entry(
3534 	vm_map_t src_map,
3535 	vm_map_t dst_map,
3536 	vm_map_entry_t src_entry,
3537 	vm_map_entry_t dst_entry,
3538 	vm_ooffset_t *fork_charge)
3539 {
3540 	vm_object_t src_object;
3541 	vm_map_entry_t fake_entry;
3542 	vm_offset_t size;
3543 	struct ucred *cred;
3544 	int charged;
3545 
3546 	VM_MAP_ASSERT_LOCKED(dst_map);
3547 
3548 	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
3549 		return;
3550 
3551 	if (src_entry->wired_count == 0 ||
3552 	    (src_entry->protection & VM_PROT_WRITE) == 0) {
3553 		/*
3554 		 * If the source entry is marked needs_copy, it is already
3555 		 * write-protected.
3556 		 */
3557 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
3558 		    (src_entry->protection & VM_PROT_WRITE) != 0) {
3559 			pmap_protect(src_map->pmap,
3560 			    src_entry->start,
3561 			    src_entry->end,
3562 			    src_entry->protection & ~VM_PROT_WRITE);
3563 		}
3564 
3565 		/*
3566 		 * Make a copy of the object.
3567 		 */
3568 		size = src_entry->end - src_entry->start;
3569 		if ((src_object = src_entry->object.vm_object) != NULL) {
3570 			VM_OBJECT_WLOCK(src_object);
3571 			charged = ENTRY_CHARGED(src_entry);
3572 			if (src_object->handle == NULL &&
3573 			    (src_object->type == OBJT_DEFAULT ||
3574 			    src_object->type == OBJT_SWAP)) {
3575 				vm_object_collapse(src_object);
3576 				if ((src_object->flags & (OBJ_NOSPLIT |
3577 				    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
3578 					vm_object_split(src_entry);
3579 					src_object =
3580 					    src_entry->object.vm_object;
3581 				}
3582 			}
3583 			vm_object_reference_locked(src_object);
3584 			vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3585 			if (src_entry->cred != NULL &&
3586 			    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3587 				KASSERT(src_object->cred == NULL,
3588 				    ("OVERCOMMIT: vm_map_copy_entry: cred %p",
3589 				     src_object));
3590 				src_object->cred = src_entry->cred;
3591 				src_object->charge = size;
3592 			}
3593 			VM_OBJECT_WUNLOCK(src_object);
3594 			dst_entry->object.vm_object = src_object;
3595 			if (charged) {
3596 				cred = curthread->td_ucred;
3597 				crhold(cred);
3598 				dst_entry->cred = cred;
3599 				*fork_charge += size;
3600 				if (!(src_entry->eflags &
3601 				      MAP_ENTRY_NEEDS_COPY)) {
3602 					crhold(cred);
3603 					src_entry->cred = cred;
3604 					*fork_charge += size;
3605 				}
3606 			}
3607 			src_entry->eflags |= MAP_ENTRY_COW |
3608 			    MAP_ENTRY_NEEDS_COPY;
3609 			dst_entry->eflags |= MAP_ENTRY_COW |
3610 			    MAP_ENTRY_NEEDS_COPY;
3611 			dst_entry->offset = src_entry->offset;
3612 			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3613 				/*
3614 				 * MAP_ENTRY_VN_WRITECNT cannot
3615 				 * indicate write reference from
3616 				 * src_entry, since the entry is
3617 				 * marked as needs copy.  Allocate a
3618 				 * fake entry that is used to
3619 				 * decrement object->un_pager.vnp.writecount
3620 				 * at the appropriate time.  Attach
3621 				 * fake_entry to the deferred list.
3622 				 */
3623 				fake_entry = vm_map_entry_create(dst_map);
3624 				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
3625 				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
3626 				vm_object_reference(src_object);
3627 				fake_entry->object.vm_object = src_object;
3628 				fake_entry->start = src_entry->start;
3629 				fake_entry->end = src_entry->end;
3630 				fake_entry->next = curthread->td_map_def_user;
3631 				curthread->td_map_def_user = fake_entry;
3632 			}
3633 
3634 			pmap_copy(dst_map->pmap, src_map->pmap,
3635 			    dst_entry->start, dst_entry->end - dst_entry->start,
3636 			    src_entry->start);
3637 		} else {
3638 			dst_entry->object.vm_object = NULL;
3639 			dst_entry->offset = 0;
3640 			if (src_entry->cred != NULL) {
3641 				dst_entry->cred = curthread->td_ucred;
3642 				crhold(dst_entry->cred);
3643 				*fork_charge += size;
3644 			}
3645 		}
3646 	} else {
3647 		/*
3648 		 * We don't want to make writeable wired pages copy-on-write.
3649 		 * Immediately copy these pages into the new map by simulating
3650 		 * page faults.  The new pages are pageable.
3651 		 */
3652 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
3653 		    fork_charge);
3654 	}
3655 }
3656 
3657 /*
3658  * vmspace_map_entry_forked:
3659  * Update the newly-forked vmspace each time a map entry is inherited
3660  * or copied.  The values for vm_dsize and vm_tsize are approximate
3661  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
3662  */
3663 static void
3664 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
3665     vm_map_entry_t entry)
3666 {
3667 	vm_size_t entrysize;
3668 	vm_offset_t newend;
3669 
3670 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
3671 		return;
3672 	entrysize = entry->end - entry->start;
3673 	vm2->vm_map.size += entrysize;
3674 	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
3675 		vm2->vm_ssize += btoc(entrysize);
3676 	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
3677 	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
3678 		newend = MIN(entry->end,
3679 		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
3680 		vm2->vm_dsize += btoc(newend - entry->start);
3681 	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
3682 	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
3683 		newend = MIN(entry->end,
3684 		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
3685 		vm2->vm_tsize += btoc(newend - entry->start);
3686 	}
3687 }
3688 
3689 /*
3690  * vmspace_fork:
3691  * Create a new process vmspace structure and vm_map
3692  * based on those of an existing process.  The new map
3693  * is based on the old map, according to the inheritance
3694  * values on the regions in that map.
3695  *
3696  * XXX It might be worth coalescing the entries added to the new vmspace.
3697  *
3698  * The source map must not be locked.
3699  */
3700 struct vmspace *
3701 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
3702 {
3703 	struct vmspace *vm2;
3704 	vm_map_t new_map, old_map;
3705 	vm_map_entry_t new_entry, old_entry;
3706 	vm_object_t object;
3707 	int error, locked;
3708 	vm_inherit_t inh;
3709 
3710 	old_map = &vm1->vm_map;
3711 	/* Copy immutable fields of vm1 to vm2. */
3712 	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
3713 	    pmap_pinit);
3714 	if (vm2 == NULL)
3715 		return (NULL);
3716 
3717 	vm2->vm_taddr = vm1->vm_taddr;
3718 	vm2->vm_daddr = vm1->vm_daddr;
3719 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3720 	vm_map_lock(old_map);
3721 	if (old_map->busy)
3722 		vm_map_wait_busy(old_map);
3723 	new_map = &vm2->vm_map;
3724 	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3725 	KASSERT(locked, ("vmspace_fork: lock failed"));
3726 
3727 	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
3728 	if (error != 0) {
3729 		sx_xunlock(&old_map->lock);
3730 		sx_xunlock(&new_map->lock);
3731 		vm_map_process_deferred();
3732 		vmspace_free(vm2);
3733 		return (NULL);
3734 	}
3735 
3736 	new_map->anon_loc = old_map->anon_loc;
3737 
3738 	old_entry = old_map->header.next;
3739 
3740 	while (old_entry != &old_map->header) {
3741 		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3742 			panic("vm_map_fork: encountered a submap");
3743 
3744 		inh = old_entry->inheritance;
3745 		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
3746 		    inh != VM_INHERIT_NONE)
3747 			inh = VM_INHERIT_COPY;
3748 
3749 		switch (inh) {
3750 		case VM_INHERIT_NONE:
3751 			break;
3752 
3753 		case VM_INHERIT_SHARE:
3754 			/*
3755 			 * Clone the entry, creating the shared object if necessary.
3756 			 */
3757 			object = old_entry->object.vm_object;
3758 			if (object == NULL) {
3759 				object = vm_object_allocate(OBJT_DEFAULT,
3760 					atop(old_entry->end - old_entry->start));
3761 				old_entry->object.vm_object = object;
3762 				old_entry->offset = 0;
3763 				if (old_entry->cred != NULL) {
3764 					object->cred = old_entry->cred;
3765 					object->charge = old_entry->end -
3766 					    old_entry->start;
3767 					old_entry->cred = NULL;
3768 				}
3769 			}
3770 
3771 			/*
3772 			 * Add the reference before calling vm_object_shadow
3773 			 * to insure that a shadow object is created.
3774 			 */
3775 			vm_object_reference(object);
3776 			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3777 				vm_object_shadow(&old_entry->object.vm_object,
3778 				    &old_entry->offset,
3779 				    old_entry->end - old_entry->start);
3780 				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3781 				/* Transfer the second reference too. */
3782 				vm_object_reference(
3783 				    old_entry->object.vm_object);
3784 
3785 				/*
3786 				 * As in vm_map_simplify_entry(), the
3787 				 * vnode lock will not be acquired in
3788 				 * this call to vm_object_deallocate().
3789 				 */
3790 				vm_object_deallocate(object);
3791 				object = old_entry->object.vm_object;
3792 			}
3793 			VM_OBJECT_WLOCK(object);
3794 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
3795 			if (old_entry->cred != NULL) {
3796 				KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
3797 				object->cred = old_entry->cred;
3798 				object->charge = old_entry->end - old_entry->start;
3799 				old_entry->cred = NULL;
3800 			}
3801 
3802 			/*
3803 			 * Assert the correct state of the vnode
3804 			 * v_writecount while the object is locked, to
3805 			 * not relock it later for the assertion
3806 			 * correctness.
3807 			 */
3808 			if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT &&
3809 			    object->type == OBJT_VNODE) {
3810 				KASSERT(((struct vnode *)object->handle)->
3811 				    v_writecount > 0,
3812 				    ("vmspace_fork: v_writecount %p", object));
3813 				KASSERT(object->un_pager.vnp.writemappings > 0,
3814 				    ("vmspace_fork: vnp.writecount %p",
3815 				    object));
3816 			}
3817 			VM_OBJECT_WUNLOCK(object);
3818 
3819 			/*
3820 			 * Clone the entry, referencing the shared object.
3821 			 */
3822 			new_entry = vm_map_entry_create(new_map);
3823 			*new_entry = *old_entry;
3824 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3825 			    MAP_ENTRY_IN_TRANSITION);
3826 			new_entry->wiring_thread = NULL;
3827 			new_entry->wired_count = 0;
3828 			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3829 				vnode_pager_update_writecount(object,
3830 				    new_entry->start, new_entry->end);
3831 			}
3832 
3833 			/*
3834 			 * Insert the entry into the new map -- we know we're
3835 			 * inserting at the end of the new map.
3836 			 */
3837 			vm_map_entry_link(new_map, new_entry);
3838 			vmspace_map_entry_forked(vm1, vm2, new_entry);
3839 
3840 			/*
3841 			 * Update the physical map
3842 			 */
3843 			pmap_copy(new_map->pmap, old_map->pmap,
3844 			    new_entry->start,
3845 			    (old_entry->end - old_entry->start),
3846 			    old_entry->start);
3847 			break;
3848 
3849 		case VM_INHERIT_COPY:
3850 			/*
3851 			 * Clone the entry and link into the map.
3852 			 */
3853 			new_entry = vm_map_entry_create(new_map);
3854 			*new_entry = *old_entry;
3855 			/*
3856 			 * Copied entry is COW over the old object.
3857 			 */
3858 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3859 			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
3860 			new_entry->wiring_thread = NULL;
3861 			new_entry->wired_count = 0;
3862 			new_entry->object.vm_object = NULL;
3863 			new_entry->cred = NULL;
3864 			vm_map_entry_link(new_map, new_entry);
3865 			vmspace_map_entry_forked(vm1, vm2, new_entry);
3866 			vm_map_copy_entry(old_map, new_map, old_entry,
3867 			    new_entry, fork_charge);
3868 			break;
3869 
3870 		case VM_INHERIT_ZERO:
3871 			/*
3872 			 * Create a new anonymous mapping entry modelled from
3873 			 * the old one.
3874 			 */
3875 			new_entry = vm_map_entry_create(new_map);
3876 			memset(new_entry, 0, sizeof(*new_entry));
3877 
3878 			new_entry->start = old_entry->start;
3879 			new_entry->end = old_entry->end;
3880 			new_entry->eflags = old_entry->eflags &
3881 			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
3882 			    MAP_ENTRY_VN_WRITECNT);
3883 			new_entry->protection = old_entry->protection;
3884 			new_entry->max_protection = old_entry->max_protection;
3885 			new_entry->inheritance = VM_INHERIT_ZERO;
3886 
3887 			vm_map_entry_link(new_map, new_entry);
3888 			vmspace_map_entry_forked(vm1, vm2, new_entry);
3889 
3890 			new_entry->cred = curthread->td_ucred;
3891 			crhold(new_entry->cred);
3892 			*fork_charge += (new_entry->end - new_entry->start);
3893 
3894 			break;
3895 		}
3896 		old_entry = old_entry->next;
3897 	}
3898 	/*
3899 	 * Use inlined vm_map_unlock() to postpone handling the deferred
3900 	 * map entries, which cannot be done until both old_map and
3901 	 * new_map locks are released.
3902 	 */
3903 	sx_xunlock(&old_map->lock);
3904 	sx_xunlock(&new_map->lock);
3905 	vm_map_process_deferred();
3906 
3907 	return (vm2);
3908 }
3909 
3910 /*
3911  * Create a process's stack for exec_new_vmspace().  This function is never
3912  * asked to wire the newly created stack.
3913  */
3914 int
3915 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3916     vm_prot_t prot, vm_prot_t max, int cow)
3917 {
3918 	vm_size_t growsize, init_ssize;
3919 	rlim_t vmemlim;
3920 	int rv;
3921 
3922 	MPASS((map->flags & MAP_WIREFUTURE) == 0);
3923 	growsize = sgrowsiz;
3924 	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
3925 	vm_map_lock(map);
3926 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
3927 	/* If we would blow our VMEM resource limit, no go */
3928 	if (map->size + init_ssize > vmemlim) {
3929 		rv = KERN_NO_SPACE;
3930 		goto out;
3931 	}
3932 	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
3933 	    max, cow);
3934 out:
3935 	vm_map_unlock(map);
3936 	return (rv);
3937 }
3938 
3939 static int stack_guard_page = 1;
3940 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
3941     &stack_guard_page, 0,
3942     "Specifies the number of guard pages for a stack that grows");
3943 
3944 static int
3945 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3946     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
3947 {
3948 	vm_map_entry_t new_entry, prev_entry;
3949 	vm_offset_t bot, gap_bot, gap_top, top;
3950 	vm_size_t init_ssize, sgp;
3951 	int orient, rv;
3952 
3953 	/*
3954 	 * The stack orientation is piggybacked with the cow argument.
3955 	 * Extract it into orient and mask the cow argument so that we
3956 	 * don't pass it around further.
3957 	 */
3958 	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
3959 	KASSERT(orient != 0, ("No stack grow direction"));
3960 	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
3961 	    ("bi-dir stack"));
3962 
3963 	if (addrbos < vm_map_min(map) ||
3964 	    addrbos + max_ssize > vm_map_max(map) ||
3965 	    addrbos + max_ssize <= addrbos)
3966 		return (KERN_INVALID_ADDRESS);
3967 	sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
3968 	if (sgp >= max_ssize)
3969 		return (KERN_INVALID_ARGUMENT);
3970 
3971 	init_ssize = growsize;
3972 	if (max_ssize < init_ssize + sgp)
3973 		init_ssize = max_ssize - sgp;
3974 
3975 	/* If addr is already mapped, no go */
3976 	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
3977 		return (KERN_NO_SPACE);
3978 
3979 	/*
3980 	 * If we can't accommodate max_ssize in the current mapping, no go.
3981 	 */
3982 	if (prev_entry->next->start < addrbos + max_ssize)
3983 		return (KERN_NO_SPACE);
3984 
3985 	/*
3986 	 * We initially map a stack of only init_ssize.  We will grow as
3987 	 * needed later.  Depending on the orientation of the stack (i.e.
3988 	 * the grow direction) we either map at the top of the range, the
3989 	 * bottom of the range or in the middle.
3990 	 *
3991 	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
3992 	 * and cow to be 0.  Possibly we should eliminate these as input
3993 	 * parameters, and just pass these values here in the insert call.
3994 	 */
3995 	if (orient == MAP_STACK_GROWS_DOWN) {
3996 		bot = addrbos + max_ssize - init_ssize;
3997 		top = bot + init_ssize;
3998 		gap_bot = addrbos;
3999 		gap_top = bot;
4000 	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
4001 		bot = addrbos;
4002 		top = bot + init_ssize;
4003 		gap_bot = top;
4004 		gap_top = addrbos + max_ssize;
4005 	}
4006 	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
4007 	if (rv != KERN_SUCCESS)
4008 		return (rv);
4009 	new_entry = prev_entry->next;
4010 	KASSERT(new_entry->end == top || new_entry->start == bot,
4011 	    ("Bad entry start/end for new stack entry"));
4012 	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
4013 	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4014 	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4015 	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
4016 	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
4017 	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
4018 	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4019 	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
4020 	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
4021 	if (rv != KERN_SUCCESS)
4022 		(void)vm_map_delete(map, bot, top);
4023 	return (rv);
4024 }
4025 
4026 /*
4027  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
4028  * successfully grow the stack.
4029  */
4030 static int
4031 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4032 {
4033 	vm_map_entry_t stack_entry;
4034 	struct proc *p;
4035 	struct vmspace *vm;
4036 	struct ucred *cred;
4037 	vm_offset_t gap_end, gap_start, grow_start;
4038 	size_t grow_amount, guard, max_grow;
4039 	rlim_t lmemlim, stacklim, vmemlim;
4040 	int rv, rv1;
4041 	bool gap_deleted, grow_down, is_procstack;
4042 #ifdef notyet
4043 	uint64_t limit;
4044 #endif
4045 #ifdef RACCT
4046 	int error;
4047 #endif
4048 
4049 	p = curproc;
4050 	vm = p->p_vmspace;
4051 
4052 	/*
4053 	 * Disallow stack growth when the access is performed by a
4054 	 * debugger or AIO daemon.  The reason is that the wrong
4055 	 * resource limits are applied.
4056 	 */
4057 	if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
4058 		return (KERN_FAILURE);
4059 
4060 	MPASS(!map->system_map);
4061 
4062 	guard = stack_guard_page * PAGE_SIZE;
4063 	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4064 	stacklim = lim_cur(curthread, RLIMIT_STACK);
4065 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4066 retry:
4067 	/* If addr is not in a hole for a stack grow area, no need to grow. */
4068 	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4069 		return (KERN_FAILURE);
4070 	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4071 		return (KERN_SUCCESS);
4072 	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
4073 		stack_entry = gap_entry->next;
4074 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4075 		    stack_entry->start != gap_entry->end)
4076 			return (KERN_FAILURE);
4077 		grow_amount = round_page(stack_entry->start - addr);
4078 		grow_down = true;
4079 	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
4080 		stack_entry = gap_entry->prev;
4081 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
4082 		    stack_entry->end != gap_entry->start)
4083 			return (KERN_FAILURE);
4084 		grow_amount = round_page(addr + 1 - stack_entry->end);
4085 		grow_down = false;
4086 	} else {
4087 		return (KERN_FAILURE);
4088 	}
4089 	max_grow = gap_entry->end - gap_entry->start;
4090 	if (guard > max_grow)
4091 		return (KERN_NO_SPACE);
4092 	max_grow -= guard;
4093 	if (grow_amount > max_grow)
4094 		return (KERN_NO_SPACE);
4095 
4096 	/*
4097 	 * If this is the main process stack, see if we're over the stack
4098 	 * limit.
4099 	 */
4100 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4101 	    addr < (vm_offset_t)p->p_sysent->sv_usrstack;
4102 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4103 		return (KERN_NO_SPACE);
4104 
4105 #ifdef RACCT
4106 	if (racct_enable) {
4107 		PROC_LOCK(p);
4108 		if (is_procstack && racct_set(p, RACCT_STACK,
4109 		    ctob(vm->vm_ssize) + grow_amount)) {
4110 			PROC_UNLOCK(p);
4111 			return (KERN_NO_SPACE);
4112 		}
4113 		PROC_UNLOCK(p);
4114 	}
4115 #endif
4116 
4117 	grow_amount = roundup(grow_amount, sgrowsiz);
4118 	if (grow_amount > max_grow)
4119 		grow_amount = max_grow;
4120 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4121 		grow_amount = trunc_page((vm_size_t)stacklim) -
4122 		    ctob(vm->vm_ssize);
4123 	}
4124 
4125 #ifdef notyet
4126 	PROC_LOCK(p);
4127 	limit = racct_get_available(p, RACCT_STACK);
4128 	PROC_UNLOCK(p);
4129 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4130 		grow_amount = limit - ctob(vm->vm_ssize);
4131 #endif
4132 
4133 	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4134 		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4135 			rv = KERN_NO_SPACE;
4136 			goto out;
4137 		}
4138 #ifdef RACCT
4139 		if (racct_enable) {
4140 			PROC_LOCK(p);
4141 			if (racct_set(p, RACCT_MEMLOCK,
4142 			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4143 				PROC_UNLOCK(p);
4144 				rv = KERN_NO_SPACE;
4145 				goto out;
4146 			}
4147 			PROC_UNLOCK(p);
4148 		}
4149 #endif
4150 	}
4151 
4152 	/* If we would blow our VMEM resource limit, no go */
4153 	if (map->size + grow_amount > vmemlim) {
4154 		rv = KERN_NO_SPACE;
4155 		goto out;
4156 	}
4157 #ifdef RACCT
4158 	if (racct_enable) {
4159 		PROC_LOCK(p);
4160 		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4161 			PROC_UNLOCK(p);
4162 			rv = KERN_NO_SPACE;
4163 			goto out;
4164 		}
4165 		PROC_UNLOCK(p);
4166 	}
4167 #endif
4168 
4169 	if (vm_map_lock_upgrade(map)) {
4170 		gap_entry = NULL;
4171 		vm_map_lock_read(map);
4172 		goto retry;
4173 	}
4174 
4175 	if (grow_down) {
4176 		grow_start = gap_entry->end - grow_amount;
4177 		if (gap_entry->start + grow_amount == gap_entry->end) {
4178 			gap_start = gap_entry->start;
4179 			gap_end = gap_entry->end;
4180 			vm_map_entry_delete(map, gap_entry);
4181 			gap_deleted = true;
4182 		} else {
4183 			MPASS(gap_entry->start < gap_entry->end - grow_amount);
4184 			gap_entry->end -= grow_amount;
4185 			vm_map_entry_resize_free(map, gap_entry);
4186 			gap_deleted = false;
4187 		}
4188 		rv = vm_map_insert(map, NULL, 0, grow_start,
4189 		    grow_start + grow_amount,
4190 		    stack_entry->protection, stack_entry->max_protection,
4191 		    MAP_STACK_GROWS_DOWN);
4192 		if (rv != KERN_SUCCESS) {
4193 			if (gap_deleted) {
4194 				rv1 = vm_map_insert(map, NULL, 0, gap_start,
4195 				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
4196 				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
4197 				MPASS(rv1 == KERN_SUCCESS);
4198 			} else {
4199 				gap_entry->end += grow_amount;
4200 				vm_map_entry_resize_free(map, gap_entry);
4201 			}
4202 		}
4203 	} else {
4204 		grow_start = stack_entry->end;
4205 		cred = stack_entry->cred;
4206 		if (cred == NULL && stack_entry->object.vm_object != NULL)
4207 			cred = stack_entry->object.vm_object->cred;
4208 		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
4209 			rv = KERN_NO_SPACE;
4210 		/* Grow the underlying object if applicable. */
4211 		else if (stack_entry->object.vm_object == NULL ||
4212 		    vm_object_coalesce(stack_entry->object.vm_object,
4213 		    stack_entry->offset,
4214 		    (vm_size_t)(stack_entry->end - stack_entry->start),
4215 		    (vm_size_t)grow_amount, cred != NULL)) {
4216 			if (gap_entry->start + grow_amount == gap_entry->end)
4217 				vm_map_entry_delete(map, gap_entry);
4218 			else
4219 				gap_entry->start += grow_amount;
4220 			stack_entry->end += grow_amount;
4221 			map->size += grow_amount;
4222 			vm_map_entry_resize_free(map, stack_entry);
4223 			rv = KERN_SUCCESS;
4224 		} else
4225 			rv = KERN_FAILURE;
4226 	}
4227 	if (rv == KERN_SUCCESS && is_procstack)
4228 		vm->vm_ssize += btoc(grow_amount);
4229 
4230 	/*
4231 	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
4232 	 */
4233 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4234 		vm_map_unlock(map);
4235 		vm_map_wire(map, grow_start, grow_start + grow_amount,
4236 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4237 		vm_map_lock_read(map);
4238 	} else
4239 		vm_map_lock_downgrade(map);
4240 
4241 out:
4242 #ifdef RACCT
4243 	if (racct_enable && rv != KERN_SUCCESS) {
4244 		PROC_LOCK(p);
4245 		error = racct_set(p, RACCT_VMEM, map->size);
4246 		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4247 		if (!old_mlock) {
4248 			error = racct_set(p, RACCT_MEMLOCK,
4249 			    ptoa(pmap_wired_count(map->pmap)));
4250 			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4251 		}
4252 	    	error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4253 		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4254 		PROC_UNLOCK(p);
4255 	}
4256 #endif
4257 
4258 	return (rv);
4259 }
4260 
4261 /*
4262  * Unshare the specified VM space for exec.  If other processes are
4263  * mapped to it, then create a new one.  The new vmspace is null.
4264  */
4265 int
4266 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4267 {
4268 	struct vmspace *oldvmspace = p->p_vmspace;
4269 	struct vmspace *newvmspace;
4270 
4271 	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4272 	    ("vmspace_exec recursed"));
4273 	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4274 	if (newvmspace == NULL)
4275 		return (ENOMEM);
4276 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
4277 	/*
4278 	 * This code is written like this for prototype purposes.  The
4279 	 * goal is to avoid running down the vmspace here, but let the
4280 	 * other process's that are still using the vmspace to finally
4281 	 * run it down.  Even though there is little or no chance of blocking
4282 	 * here, it is a good idea to keep this form for future mods.
4283 	 */
4284 	PROC_VMSPACE_LOCK(p);
4285 	p->p_vmspace = newvmspace;
4286 	PROC_VMSPACE_UNLOCK(p);
4287 	if (p == curthread->td_proc)
4288 		pmap_activate(curthread);
4289 	curthread->td_pflags |= TDP_EXECVMSPC;
4290 	return (0);
4291 }
4292 
4293 /*
4294  * Unshare the specified VM space for forcing COW.  This
4295  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4296  */
4297 int
4298 vmspace_unshare(struct proc *p)
4299 {
4300 	struct vmspace *oldvmspace = p->p_vmspace;
4301 	struct vmspace *newvmspace;
4302 	vm_ooffset_t fork_charge;
4303 
4304 	if (oldvmspace->vm_refcnt == 1)
4305 		return (0);
4306 	fork_charge = 0;
4307 	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4308 	if (newvmspace == NULL)
4309 		return (ENOMEM);
4310 	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4311 		vmspace_free(newvmspace);
4312 		return (ENOMEM);
4313 	}
4314 	PROC_VMSPACE_LOCK(p);
4315 	p->p_vmspace = newvmspace;
4316 	PROC_VMSPACE_UNLOCK(p);
4317 	if (p == curthread->td_proc)
4318 		pmap_activate(curthread);
4319 	vmspace_free(oldvmspace);
4320 	return (0);
4321 }
4322 
4323 /*
4324  *	vm_map_lookup:
4325  *
4326  *	Finds the VM object, offset, and
4327  *	protection for a given virtual address in the
4328  *	specified map, assuming a page fault of the
4329  *	type specified.
4330  *
4331  *	Leaves the map in question locked for read; return
4332  *	values are guaranteed until a vm_map_lookup_done
4333  *	call is performed.  Note that the map argument
4334  *	is in/out; the returned map must be used in
4335  *	the call to vm_map_lookup_done.
4336  *
4337  *	A handle (out_entry) is returned for use in
4338  *	vm_map_lookup_done, to make that fast.
4339  *
4340  *	If a lookup is requested with "write protection"
4341  *	specified, the map may be changed to perform virtual
4342  *	copying operations, although the data referenced will
4343  *	remain the same.
4344  */
4345 int
4346 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
4347 	      vm_offset_t vaddr,
4348 	      vm_prot_t fault_typea,
4349 	      vm_map_entry_t *out_entry,	/* OUT */
4350 	      vm_object_t *object,		/* OUT */
4351 	      vm_pindex_t *pindex,		/* OUT */
4352 	      vm_prot_t *out_prot,		/* OUT */
4353 	      boolean_t *wired)			/* OUT */
4354 {
4355 	vm_map_entry_t entry;
4356 	vm_map_t map = *var_map;
4357 	vm_prot_t prot;
4358 	vm_prot_t fault_type = fault_typea;
4359 	vm_object_t eobject;
4360 	vm_size_t size;
4361 	struct ucred *cred;
4362 
4363 RetryLookup:
4364 
4365 	vm_map_lock_read(map);
4366 
4367 RetryLookupLocked:
4368 	/*
4369 	 * Lookup the faulting address.
4370 	 */
4371 	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
4372 		vm_map_unlock_read(map);
4373 		return (KERN_INVALID_ADDRESS);
4374 	}
4375 
4376 	entry = *out_entry;
4377 
4378 	/*
4379 	 * Handle submaps.
4380 	 */
4381 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4382 		vm_map_t old_map = map;
4383 
4384 		*var_map = map = entry->object.sub_map;
4385 		vm_map_unlock_read(old_map);
4386 		goto RetryLookup;
4387 	}
4388 
4389 	/*
4390 	 * Check whether this task is allowed to have this page.
4391 	 */
4392 	prot = entry->protection;
4393 	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
4394 		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
4395 		if (prot == VM_PROT_NONE && map != kernel_map &&
4396 		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4397 		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4398 		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
4399 		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
4400 			goto RetryLookupLocked;
4401 	}
4402 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4403 	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4404 		vm_map_unlock_read(map);
4405 		return (KERN_PROTECTION_FAILURE);
4406 	}
4407 	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
4408 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
4409 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
4410 	    ("entry %p flags %x", entry, entry->eflags));
4411 	if ((fault_typea & VM_PROT_COPY) != 0 &&
4412 	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
4413 	    (entry->eflags & MAP_ENTRY_COW) == 0) {
4414 		vm_map_unlock_read(map);
4415 		return (KERN_PROTECTION_FAILURE);
4416 	}
4417 
4418 	/*
4419 	 * If this page is not pageable, we have to get it for all possible
4420 	 * accesses.
4421 	 */
4422 	*wired = (entry->wired_count != 0);
4423 	if (*wired)
4424 		fault_type = entry->protection;
4425 	size = entry->end - entry->start;
4426 	/*
4427 	 * If the entry was copy-on-write, we either ...
4428 	 */
4429 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4430 		/*
4431 		 * If we want to write the page, we may as well handle that
4432 		 * now since we've got the map locked.
4433 		 *
4434 		 * If we don't need to write the page, we just demote the
4435 		 * permissions allowed.
4436 		 */
4437 		if ((fault_type & VM_PROT_WRITE) != 0 ||
4438 		    (fault_typea & VM_PROT_COPY) != 0) {
4439 			/*
4440 			 * Make a new object, and place it in the object
4441 			 * chain.  Note that no new references have appeared
4442 			 * -- one just moved from the map to the new
4443 			 * object.
4444 			 */
4445 			if (vm_map_lock_upgrade(map))
4446 				goto RetryLookup;
4447 
4448 			if (entry->cred == NULL) {
4449 				/*
4450 				 * The debugger owner is charged for
4451 				 * the memory.
4452 				 */
4453 				cred = curthread->td_ucred;
4454 				crhold(cred);
4455 				if (!swap_reserve_by_cred(size, cred)) {
4456 					crfree(cred);
4457 					vm_map_unlock(map);
4458 					return (KERN_RESOURCE_SHORTAGE);
4459 				}
4460 				entry->cred = cred;
4461 			}
4462 			vm_object_shadow(&entry->object.vm_object,
4463 			    &entry->offset, size);
4464 			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4465 			eobject = entry->object.vm_object;
4466 			if (eobject->cred != NULL) {
4467 				/*
4468 				 * The object was not shadowed.
4469 				 */
4470 				swap_release_by_cred(size, entry->cred);
4471 				crfree(entry->cred);
4472 				entry->cred = NULL;
4473 			} else if (entry->cred != NULL) {
4474 				VM_OBJECT_WLOCK(eobject);
4475 				eobject->cred = entry->cred;
4476 				eobject->charge = size;
4477 				VM_OBJECT_WUNLOCK(eobject);
4478 				entry->cred = NULL;
4479 			}
4480 
4481 			vm_map_lock_downgrade(map);
4482 		} else {
4483 			/*
4484 			 * We're attempting to read a copy-on-write page --
4485 			 * don't allow writes.
4486 			 */
4487 			prot &= ~VM_PROT_WRITE;
4488 		}
4489 	}
4490 
4491 	/*
4492 	 * Create an object if necessary.
4493 	 */
4494 	if (entry->object.vm_object == NULL &&
4495 	    !map->system_map) {
4496 		if (vm_map_lock_upgrade(map))
4497 			goto RetryLookup;
4498 		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
4499 		    atop(size));
4500 		entry->offset = 0;
4501 		if (entry->cred != NULL) {
4502 			VM_OBJECT_WLOCK(entry->object.vm_object);
4503 			entry->object.vm_object->cred = entry->cred;
4504 			entry->object.vm_object->charge = size;
4505 			VM_OBJECT_WUNLOCK(entry->object.vm_object);
4506 			entry->cred = NULL;
4507 		}
4508 		vm_map_lock_downgrade(map);
4509 	}
4510 
4511 	/*
4512 	 * Return the object/offset from this entry.  If the entry was
4513 	 * copy-on-write or empty, it has been fixed up.
4514 	 */
4515 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4516 	*object = entry->object.vm_object;
4517 
4518 	*out_prot = prot;
4519 	return (KERN_SUCCESS);
4520 }
4521 
4522 /*
4523  *	vm_map_lookup_locked:
4524  *
4525  *	Lookup the faulting address.  A version of vm_map_lookup that returns
4526  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
4527  */
4528 int
4529 vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
4530 		     vm_offset_t vaddr,
4531 		     vm_prot_t fault_typea,
4532 		     vm_map_entry_t *out_entry,	/* OUT */
4533 		     vm_object_t *object,	/* OUT */
4534 		     vm_pindex_t *pindex,	/* OUT */
4535 		     vm_prot_t *out_prot,	/* OUT */
4536 		     boolean_t *wired)		/* OUT */
4537 {
4538 	vm_map_entry_t entry;
4539 	vm_map_t map = *var_map;
4540 	vm_prot_t prot;
4541 	vm_prot_t fault_type = fault_typea;
4542 
4543 	/*
4544 	 * Lookup the faulting address.
4545 	 */
4546 	if (!vm_map_lookup_entry(map, vaddr, out_entry))
4547 		return (KERN_INVALID_ADDRESS);
4548 
4549 	entry = *out_entry;
4550 
4551 	/*
4552 	 * Fail if the entry refers to a submap.
4553 	 */
4554 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
4555 		return (KERN_FAILURE);
4556 
4557 	/*
4558 	 * Check whether this task is allowed to have this page.
4559 	 */
4560 	prot = entry->protection;
4561 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4562 	if ((fault_type & prot) != fault_type)
4563 		return (KERN_PROTECTION_FAILURE);
4564 
4565 	/*
4566 	 * If this page is not pageable, we have to get it for all possible
4567 	 * accesses.
4568 	 */
4569 	*wired = (entry->wired_count != 0);
4570 	if (*wired)
4571 		fault_type = entry->protection;
4572 
4573 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4574 		/*
4575 		 * Fail if the entry was copy-on-write for a write fault.
4576 		 */
4577 		if (fault_type & VM_PROT_WRITE)
4578 			return (KERN_FAILURE);
4579 		/*
4580 		 * We're attempting to read a copy-on-write page --
4581 		 * don't allow writes.
4582 		 */
4583 		prot &= ~VM_PROT_WRITE;
4584 	}
4585 
4586 	/*
4587 	 * Fail if an object should be created.
4588 	 */
4589 	if (entry->object.vm_object == NULL && !map->system_map)
4590 		return (KERN_FAILURE);
4591 
4592 	/*
4593 	 * Return the object/offset from this entry.  If the entry was
4594 	 * copy-on-write or empty, it has been fixed up.
4595 	 */
4596 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4597 	*object = entry->object.vm_object;
4598 
4599 	*out_prot = prot;
4600 	return (KERN_SUCCESS);
4601 }
4602 
4603 /*
4604  *	vm_map_lookup_done:
4605  *
4606  *	Releases locks acquired by a vm_map_lookup
4607  *	(according to the handle returned by that lookup).
4608  */
4609 void
4610 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
4611 {
4612 	/*
4613 	 * Unlock the main-level map
4614 	 */
4615 	vm_map_unlock_read(map);
4616 }
4617 
4618 vm_offset_t
4619 vm_map_max_KBI(const struct vm_map *map)
4620 {
4621 
4622 	return (vm_map_max(map));
4623 }
4624 
4625 vm_offset_t
4626 vm_map_min_KBI(const struct vm_map *map)
4627 {
4628 
4629 	return (vm_map_min(map));
4630 }
4631 
4632 pmap_t
4633 vm_map_pmap_KBI(vm_map_t map)
4634 {
4635 
4636 	return (map->pmap);
4637 }
4638 
4639 #include "opt_ddb.h"
4640 #ifdef DDB
4641 #include <sys/kernel.h>
4642 
4643 #include <ddb/ddb.h>
4644 
4645 static void
4646 vm_map_print(vm_map_t map)
4647 {
4648 	vm_map_entry_t entry;
4649 
4650 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4651 	    (void *)map,
4652 	    (void *)map->pmap, map->nentries, map->timestamp);
4653 
4654 	db_indent += 2;
4655 	for (entry = map->header.next; entry != &map->header;
4656 	    entry = entry->next) {
4657 		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
4658 		    (void *)entry, (void *)entry->start, (void *)entry->end,
4659 		    entry->eflags);
4660 		{
4661 			static char *inheritance_name[4] =
4662 			{"share", "copy", "none", "donate_copy"};
4663 
4664 			db_iprintf(" prot=%x/%x/%s",
4665 			    entry->protection,
4666 			    entry->max_protection,
4667 			    inheritance_name[(int)(unsigned char)entry->inheritance]);
4668 			if (entry->wired_count != 0)
4669 				db_printf(", wired");
4670 		}
4671 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4672 			db_printf(", share=%p, offset=0x%jx\n",
4673 			    (void *)entry->object.sub_map,
4674 			    (uintmax_t)entry->offset);
4675 			if ((entry->prev == &map->header) ||
4676 			    (entry->prev->object.sub_map !=
4677 				entry->object.sub_map)) {
4678 				db_indent += 2;
4679 				vm_map_print((vm_map_t)entry->object.sub_map);
4680 				db_indent -= 2;
4681 			}
4682 		} else {
4683 			if (entry->cred != NULL)
4684 				db_printf(", ruid %d", entry->cred->cr_ruid);
4685 			db_printf(", object=%p, offset=0x%jx",
4686 			    (void *)entry->object.vm_object,
4687 			    (uintmax_t)entry->offset);
4688 			if (entry->object.vm_object && entry->object.vm_object->cred)
4689 				db_printf(", obj ruid %d charge %jx",
4690 				    entry->object.vm_object->cred->cr_ruid,
4691 				    (uintmax_t)entry->object.vm_object->charge);
4692 			if (entry->eflags & MAP_ENTRY_COW)
4693 				db_printf(", copy (%s)",
4694 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4695 			db_printf("\n");
4696 
4697 			if ((entry->prev == &map->header) ||
4698 			    (entry->prev->object.vm_object !=
4699 				entry->object.vm_object)) {
4700 				db_indent += 2;
4701 				vm_object_print((db_expr_t)(intptr_t)
4702 						entry->object.vm_object,
4703 						0, 0, (char *)0);
4704 				db_indent -= 2;
4705 			}
4706 		}
4707 	}
4708 	db_indent -= 2;
4709 }
4710 
4711 DB_SHOW_COMMAND(map, map)
4712 {
4713 
4714 	if (!have_addr) {
4715 		db_printf("usage: show map <addr>\n");
4716 		return;
4717 	}
4718 	vm_map_print((vm_map_t)addr);
4719 }
4720 
4721 DB_SHOW_COMMAND(procvm, procvm)
4722 {
4723 	struct proc *p;
4724 
4725 	if (have_addr) {
4726 		p = db_lookup_proc(addr);
4727 	} else {
4728 		p = curproc;
4729 	}
4730 
4731 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4732 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4733 	    (void *)vmspace_pmap(p->p_vmspace));
4734 
4735 	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
4736 }
4737 
4738 #endif /* DDB */
4739