xref: /freebsd/sys/vm/vm_fault.c (revision b5ab20c0669bff9ff0887a7afbce96919e37bb43)
160727d8bSWarner Losh /*-
2df8bae1dSRodney W. Grimes  * Copyright (c) 1991, 1993
3df8bae1dSRodney W. Grimes  *	The Regents of the University of California.  All rights reserved.
426f9a767SRodney W. Grimes  * Copyright (c) 1994 John S. Dyson
526f9a767SRodney W. Grimes  * All rights reserved.
626f9a767SRodney W. Grimes  * Copyright (c) 1994 David Greenman
726f9a767SRodney W. Grimes  * All rights reserved.
826f9a767SRodney W. Grimes  *
9df8bae1dSRodney W. Grimes  *
10df8bae1dSRodney W. Grimes  * This code is derived from software contributed to Berkeley by
11df8bae1dSRodney W. Grimes  * The Mach Operating System project at Carnegie-Mellon University.
12df8bae1dSRodney W. Grimes  *
13df8bae1dSRodney W. Grimes  * Redistribution and use in source and binary forms, with or without
14df8bae1dSRodney W. Grimes  * modification, are permitted provided that the following conditions
15df8bae1dSRodney W. Grimes  * are met:
16df8bae1dSRodney W. Grimes  * 1. Redistributions of source code must retain the above copyright
17df8bae1dSRodney W. Grimes  *    notice, this list of conditions and the following disclaimer.
18df8bae1dSRodney W. Grimes  * 2. Redistributions in binary form must reproduce the above copyright
19df8bae1dSRodney W. Grimes  *    notice, this list of conditions and the following disclaimer in the
20df8bae1dSRodney W. Grimes  *    documentation and/or other materials provided with the distribution.
21df8bae1dSRodney W. Grimes  * 3. All advertising materials mentioning features or use of this software
225929bcfaSPhilippe Charnier  *    must display the following acknowledgement:
23df8bae1dSRodney W. Grimes  *	This product includes software developed by the University of
24df8bae1dSRodney W. Grimes  *	California, Berkeley and its contributors.
25df8bae1dSRodney W. Grimes  * 4. Neither the name of the University nor the names of its contributors
26df8bae1dSRodney W. Grimes  *    may be used to endorse or promote products derived from this software
27df8bae1dSRodney W. Grimes  *    without specific prior written permission.
28df8bae1dSRodney W. Grimes  *
29df8bae1dSRodney W. Grimes  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30df8bae1dSRodney W. Grimes  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31df8bae1dSRodney W. Grimes  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32df8bae1dSRodney W. Grimes  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33df8bae1dSRodney W. Grimes  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34df8bae1dSRodney W. Grimes  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35df8bae1dSRodney W. Grimes  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36df8bae1dSRodney W. Grimes  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37df8bae1dSRodney W. Grimes  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38df8bae1dSRodney W. Grimes  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39df8bae1dSRodney W. Grimes  * SUCH DAMAGE.
40df8bae1dSRodney W. Grimes  *
413c4dd356SDavid Greenman  *	from: @(#)vm_fault.c	8.4 (Berkeley) 1/12/94
42df8bae1dSRodney W. Grimes  *
43df8bae1dSRodney W. Grimes  *
44df8bae1dSRodney W. Grimes  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
45df8bae1dSRodney W. Grimes  * All rights reserved.
46df8bae1dSRodney W. Grimes  *
47df8bae1dSRodney W. Grimes  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
48df8bae1dSRodney W. Grimes  *
49df8bae1dSRodney W. Grimes  * Permission to use, copy, modify and distribute this software and
50df8bae1dSRodney W. Grimes  * its documentation is hereby granted, provided that both the copyright
51df8bae1dSRodney W. Grimes  * notice and this permission notice appear in all copies of the
52df8bae1dSRodney W. Grimes  * software, derivative works or modified versions, and any portions
53df8bae1dSRodney W. Grimes  * thereof, and that both notices appear in supporting documentation.
54df8bae1dSRodney W. Grimes  *
55df8bae1dSRodney W. Grimes  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
56df8bae1dSRodney W. Grimes  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
57df8bae1dSRodney W. Grimes  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
58df8bae1dSRodney W. Grimes  *
59df8bae1dSRodney W. Grimes  * Carnegie Mellon requests users of this software to return to
60df8bae1dSRodney W. Grimes  *
61df8bae1dSRodney W. Grimes  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
62df8bae1dSRodney W. Grimes  *  School of Computer Science
63df8bae1dSRodney W. Grimes  *  Carnegie Mellon University
64df8bae1dSRodney W. Grimes  *  Pittsburgh PA 15213-3890
65df8bae1dSRodney W. Grimes  *
66df8bae1dSRodney W. Grimes  * any improvements or extensions that they make and grant Carnegie the
67df8bae1dSRodney W. Grimes  * rights to redistribute these changes.
68df8bae1dSRodney W. Grimes  */
69df8bae1dSRodney W. Grimes 
70df8bae1dSRodney W. Grimes /*
71df8bae1dSRodney W. Grimes  *	Page fault handling module.
72df8bae1dSRodney W. Grimes  */
73874651b1SDavid E. O'Brien 
74874651b1SDavid E. O'Brien #include <sys/cdefs.h>
75874651b1SDavid E. O'Brien __FBSDID("$FreeBSD$");
76874651b1SDavid E. O'Brien 
7735818d2eSJohn Baldwin #include "opt_ktrace.h"
78f8a47341SAlan Cox #include "opt_vm.h"
79f8a47341SAlan Cox 
80df8bae1dSRodney W. Grimes #include <sys/param.h>
81df8bae1dSRodney W. Grimes #include <sys/systm.h>
824edf4a58SJohn Baldwin #include <sys/kernel.h>
83fb919e4dSMark Murray #include <sys/lock.h>
84a8b0f100SAlan Cox #include <sys/mman.h>
8526f9a767SRodney W. Grimes #include <sys/proc.h>
8626f9a767SRodney W. Grimes #include <sys/resourcevar.h>
8789f6b863SAttilio Rao #include <sys/rwlock.h>
8823955314SAlfred Perlstein #include <sys/sysctl.h>
894edf4a58SJohn Baldwin #include <sys/vmmeter.h>
904edf4a58SJohn Baldwin #include <sys/vnode.h>
9135818d2eSJohn Baldwin #ifdef KTRACE
9235818d2eSJohn Baldwin #include <sys/ktrace.h>
9335818d2eSJohn Baldwin #endif
94df8bae1dSRodney W. Grimes 
95df8bae1dSRodney W. Grimes #include <vm/vm.h>
96efeaf95aSDavid Greenman #include <vm/vm_param.h>
97efeaf95aSDavid Greenman #include <vm/pmap.h>
98efeaf95aSDavid Greenman #include <vm/vm_map.h>
99efeaf95aSDavid Greenman #include <vm/vm_object.h>
100df8bae1dSRodney W. Grimes #include <vm/vm_page.h>
101df8bae1dSRodney W. Grimes #include <vm/vm_pageout.h>
102a83c285cSDavid Greenman #include <vm/vm_kern.h>
10324a1cce3SDavid Greenman #include <vm/vm_pager.h>
104efeaf95aSDavid Greenman #include <vm/vm_extern.h>
105dfdf9abdSAlan Cox #include <vm/vm_reserv.h>
106df8bae1dSRodney W. Grimes 
107566526a9SAlan Cox #define PFBAK 4
108566526a9SAlan Cox #define PFFOR 4
109566526a9SAlan Cox 
11011caded3SAlfred Perlstein static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
11126f9a767SRodney W. Grimes 
11213458803SAlan Cox #define	VM_FAULT_READ_BEHIND	8
1135268042bSAlan Cox #define	VM_FAULT_READ_DEFAULT	(1 + VM_FAULT_READ_AHEAD_INIT)
11413458803SAlan Cox #define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
11513458803SAlan Cox #define	VM_FAULT_NINCR		(VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
11613458803SAlan Cox #define	VM_FAULT_SUM		(VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
117a8b0f100SAlan Cox 
118a8b0f100SAlan Cox #define	VM_FAULT_DONTNEED_MIN	1048576
11926f9a767SRodney W. Grimes 
1204866e085SJohn Dyson struct faultstate {
1214866e085SJohn Dyson 	vm_page_t m;
1224866e085SJohn Dyson 	vm_object_t object;
1234866e085SJohn Dyson 	vm_pindex_t pindex;
1244866e085SJohn Dyson 	vm_page_t first_m;
1254866e085SJohn Dyson 	vm_object_t	first_object;
1264866e085SJohn Dyson 	vm_pindex_t first_pindex;
1274866e085SJohn Dyson 	vm_map_t map;
1284866e085SJohn Dyson 	vm_map_entry_t entry;
12925adb370SBrian Feldman 	int lookup_still_valid;
1304866e085SJohn Dyson 	struct vnode *vp;
1314866e085SJohn Dyson };
1324866e085SJohn Dyson 
133a8b0f100SAlan Cox static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
134a8b0f100SAlan Cox 	    int ahead);
13563281952SAlan Cox static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
13663281952SAlan Cox 	    int faultcount, int reqpage);
13713458803SAlan Cox 
13862a59e8fSWarner Losh static inline void
1394866e085SJohn Dyson release_page(struct faultstate *fs)
1404866e085SJohn Dyson {
1410d0be82aSKonstantin Belousov 
142c7aebda8SAttilio Rao 	vm_page_xunbusy(fs->m);
1432965a453SKip Macy 	vm_page_lock(fs->m);
1444866e085SJohn Dyson 	vm_page_deactivate(fs->m);
1452965a453SKip Macy 	vm_page_unlock(fs->m);
1464866e085SJohn Dyson 	fs->m = NULL;
1474866e085SJohn Dyson }
1484866e085SJohn Dyson 
14962a59e8fSWarner Losh static inline void
1504866e085SJohn Dyson unlock_map(struct faultstate *fs)
1514866e085SJohn Dyson {
1520d0be82aSKonstantin Belousov 
15325adb370SBrian Feldman 	if (fs->lookup_still_valid) {
1544866e085SJohn Dyson 		vm_map_lookup_done(fs->map, fs->entry);
15525adb370SBrian Feldman 		fs->lookup_still_valid = FALSE;
1564866e085SJohn Dyson 	}
1574866e085SJohn Dyson }
1584866e085SJohn Dyson 
1594866e085SJohn Dyson static void
160a51b0840SAlan Cox unlock_and_deallocate(struct faultstate *fs)
1614866e085SJohn Dyson {
162f29ba63eSAlan Cox 
1634866e085SJohn Dyson 	vm_object_pip_wakeup(fs->object);
16489f6b863SAttilio Rao 	VM_OBJECT_WUNLOCK(fs->object);
1654866e085SJohn Dyson 	if (fs->object != fs->first_object) {
16689f6b863SAttilio Rao 		VM_OBJECT_WLOCK(fs->first_object);
1672965a453SKip Macy 		vm_page_lock(fs->first_m);
1684866e085SJohn Dyson 		vm_page_free(fs->first_m);
1692965a453SKip Macy 		vm_page_unlock(fs->first_m);
1704866e085SJohn Dyson 		vm_object_pip_wakeup(fs->first_object);
17189f6b863SAttilio Rao 		VM_OBJECT_WUNLOCK(fs->first_object);
1724866e085SJohn Dyson 		fs->first_m = NULL;
1734866e085SJohn Dyson 	}
1744866e085SJohn Dyson 	vm_object_deallocate(fs->first_object);
1754866e085SJohn Dyson 	unlock_map(fs);
1764866e085SJohn Dyson 	if (fs->vp != NULL) {
1770cddd8f0SMatthew Dillon 		vput(fs->vp);
1784866e085SJohn Dyson 		fs->vp = NULL;
1794866e085SJohn Dyson 	}
1804866e085SJohn Dyson }
1814866e085SJohn Dyson 
182a36f5532SKonstantin Belousov static void
183a36f5532SKonstantin Belousov vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
184a36f5532SKonstantin Belousov     vm_prot_t fault_type, int fault_flags, boolean_t set_wd)
185a36f5532SKonstantin Belousov {
186a36f5532SKonstantin Belousov 	boolean_t need_dirty;
187a36f5532SKonstantin Belousov 
188a36f5532SKonstantin Belousov 	if (((prot & VM_PROT_WRITE) == 0 &&
189a36f5532SKonstantin Belousov 	    (fault_flags & VM_FAULT_DIRTY) == 0) ||
190a36f5532SKonstantin Belousov 	    (m->oflags & VPO_UNMANAGED) != 0)
191a36f5532SKonstantin Belousov 		return;
192a36f5532SKonstantin Belousov 
193a36f5532SKonstantin Belousov 	VM_OBJECT_ASSERT_LOCKED(m->object);
194a36f5532SKonstantin Belousov 
195a36f5532SKonstantin Belousov 	need_dirty = ((fault_type & VM_PROT_WRITE) != 0 &&
196a36f5532SKonstantin Belousov 	    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
197a36f5532SKonstantin Belousov 	    (fault_flags & VM_FAULT_DIRTY) != 0;
198a36f5532SKonstantin Belousov 
199a36f5532SKonstantin Belousov 	if (set_wd)
200a36f5532SKonstantin Belousov 		vm_object_set_writeable_dirty(m->object);
201a36f5532SKonstantin Belousov 	else
202a36f5532SKonstantin Belousov 		/*
203a36f5532SKonstantin Belousov 		 * If two callers of vm_fault_dirty() with set_wd ==
204a36f5532SKonstantin Belousov 		 * FALSE, one for the map entry with MAP_ENTRY_NOSYNC
205a36f5532SKonstantin Belousov 		 * flag set, other with flag clear, race, it is
206a36f5532SKonstantin Belousov 		 * possible for the no-NOSYNC thread to see m->dirty
207a36f5532SKonstantin Belousov 		 * != 0 and not clear VPO_NOSYNC.  Take vm_page lock
208a36f5532SKonstantin Belousov 		 * around manipulation of VPO_NOSYNC and
209a36f5532SKonstantin Belousov 		 * vm_page_dirty() call, to avoid the race and keep
210a36f5532SKonstantin Belousov 		 * m->oflags consistent.
211a36f5532SKonstantin Belousov 		 */
212a36f5532SKonstantin Belousov 		vm_page_lock(m);
213a36f5532SKonstantin Belousov 
214a36f5532SKonstantin Belousov 	/*
215a36f5532SKonstantin Belousov 	 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
216a36f5532SKonstantin Belousov 	 * if the page is already dirty to prevent data written with
217a36f5532SKonstantin Belousov 	 * the expectation of being synced from not being synced.
218a36f5532SKonstantin Belousov 	 * Likewise if this entry does not request NOSYNC then make
219a36f5532SKonstantin Belousov 	 * sure the page isn't marked NOSYNC.  Applications sharing
220a36f5532SKonstantin Belousov 	 * data should use the same flags to avoid ping ponging.
221a36f5532SKonstantin Belousov 	 */
222a36f5532SKonstantin Belousov 	if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) {
223a36f5532SKonstantin Belousov 		if (m->dirty == 0) {
224a36f5532SKonstantin Belousov 			m->oflags |= VPO_NOSYNC;
225a36f5532SKonstantin Belousov 		}
226a36f5532SKonstantin Belousov 	} else {
227a36f5532SKonstantin Belousov 		m->oflags &= ~VPO_NOSYNC;
228a36f5532SKonstantin Belousov 	}
229a36f5532SKonstantin Belousov 
230a36f5532SKonstantin Belousov 	/*
231a36f5532SKonstantin Belousov 	 * If the fault is a write, we know that this page is being
232a36f5532SKonstantin Belousov 	 * written NOW so dirty it explicitly to save on
233a36f5532SKonstantin Belousov 	 * pmap_is_modified() calls later.
234a36f5532SKonstantin Belousov 	 *
235a36f5532SKonstantin Belousov 	 * Also tell the backing pager, if any, that it should remove
236a36f5532SKonstantin Belousov 	 * any swap backing since the page is now dirty.
237a36f5532SKonstantin Belousov 	 */
238a36f5532SKonstantin Belousov 	if (need_dirty)
239a36f5532SKonstantin Belousov 		vm_page_dirty(m);
240a36f5532SKonstantin Belousov 	if (!set_wd)
241a36f5532SKonstantin Belousov 		vm_page_unlock(m);
242a36f5532SKonstantin Belousov 	if (need_dirty)
243a36f5532SKonstantin Belousov 		vm_pager_page_unswapped(m);
244a36f5532SKonstantin Belousov }
245a36f5532SKonstantin Belousov 
246df8bae1dSRodney W. Grimes /*
24740360b1bSMatthew Dillon  * TRYPAGER - used by vm_fault to calculate whether the pager for the
24840360b1bSMatthew Dillon  *	      current object *might* contain the page.
24940360b1bSMatthew Dillon  *
25040360b1bSMatthew Dillon  *	      default objects are zero-fill, there is no real pager.
25140360b1bSMatthew Dillon  */
25240360b1bSMatthew Dillon #define TRYPAGER	(fs.object->type != OBJT_DEFAULT && \
2532db65ab4SAlan Cox 			((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 || wired))
25440360b1bSMatthew Dillon 
25540360b1bSMatthew Dillon /*
256df8bae1dSRodney W. Grimes  *	vm_fault:
257df8bae1dSRodney W. Grimes  *
258956f3135SPhilippe Charnier  *	Handle a page fault occurring at the given address,
259df8bae1dSRodney W. Grimes  *	requiring the given permissions, in the map specified.
260df8bae1dSRodney W. Grimes  *	If successful, the page is inserted into the
261df8bae1dSRodney W. Grimes  *	associated physical map.
262df8bae1dSRodney W. Grimes  *
263df8bae1dSRodney W. Grimes  *	NOTE: the given address should be truncated to the
264df8bae1dSRodney W. Grimes  *	proper page address.
265df8bae1dSRodney W. Grimes  *
266df8bae1dSRodney W. Grimes  *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
267df8bae1dSRodney W. Grimes  *	a standard error specifying why the fault is fatal is returned.
268df8bae1dSRodney W. Grimes  *
269df8bae1dSRodney W. Grimes  *	The map in question must be referenced, and remains so.
2700cddd8f0SMatthew Dillon  *	Caller may hold no locks.
271df8bae1dSRodney W. Grimes  */
272df8bae1dSRodney W. Grimes int
27323955314SAlfred Perlstein vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
27423955314SAlfred Perlstein     int fault_flags)
27523955314SAlfred Perlstein {
27635818d2eSJohn Baldwin 	struct thread *td;
27735818d2eSJohn Baldwin 	int result;
278acd11c74SAlan Cox 
27935818d2eSJohn Baldwin 	td = curthread;
28035818d2eSJohn Baldwin 	if ((td->td_pflags & TDP_NOFAULTING) != 0)
2812801687dSKonstantin Belousov 		return (KERN_PROTECTION_FAILURE);
28235818d2eSJohn Baldwin #ifdef KTRACE
28335818d2eSJohn Baldwin 	if (map != kernel_map && KTRPOINT(td, KTR_FAULT))
28435818d2eSJohn Baldwin 		ktrfault(vaddr, fault_type);
28535818d2eSJohn Baldwin #endif
286be996836SAttilio Rao 	result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags,
287be996836SAttilio Rao 	    NULL);
28835818d2eSJohn Baldwin #ifdef KTRACE
28935818d2eSJohn Baldwin 	if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND))
29035818d2eSJohn Baldwin 		ktrfaultend(result);
29135818d2eSJohn Baldwin #endif
29235818d2eSJohn Baldwin 	return (result);
293acd11c74SAlan Cox }
294acd11c74SAlan Cox 
295acd11c74SAlan Cox int
296be996836SAttilio Rao vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
297acd11c74SAlan Cox     int fault_flags, vm_page_t *m_hold)
298acd11c74SAlan Cox {
299df8bae1dSRodney W. Grimes 	vm_prot_t prot;
30013458803SAlan Cox 	int alloc_req, era, faultcount, nera, reqpage, result;
30113458803SAlan Cox 	boolean_t growstack, is_first_object_locked, wired;
3022d8acc0fSJohn Dyson 	int map_generation;
303df8bae1dSRodney W. Grimes 	vm_object_t next_object;
30413458803SAlan Cox 	vm_page_t marray[VM_FAULT_READ_MAX];
3054866e085SJohn Dyson 	int hardfault;
3064866e085SJohn Dyson 	struct faultstate fs;
307d2bf64c3SKonstantin Belousov 	struct vnode *vp;
308afe55ca3SKonstantin Belousov 	vm_page_t m;
3095268042bSAlan Cox 	int ahead, behind, cluster_offset, error, locked;
310df8bae1dSRodney W. Grimes 
3114866e085SJohn Dyson 	hardfault = 0;
3126139043bSAlan Cox 	growstack = TRUE;
31367596082SAttilio Rao 	PCPU_INC(cnt.v_vm_faults);
314d2bf64c3SKonstantin Belousov 	fs.vp = NULL;
31513458803SAlan Cox 	faultcount = reqpage = 0;
316df8bae1dSRodney W. Grimes 
317df8bae1dSRodney W. Grimes RetryFault:;
318df8bae1dSRodney W. Grimes 
319df8bae1dSRodney W. Grimes 	/*
3200d94caffSDavid Greenman 	 * Find the backing store object and offset into it to begin the
3210d94caffSDavid Greenman 	 * search.
322df8bae1dSRodney W. Grimes 	 */
32340360b1bSMatthew Dillon 	fs.map = map;
32492de35b0SAlan Cox 	result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry,
32592de35b0SAlan Cox 	    &fs.first_object, &fs.first_pindex, &prot, &wired);
32692de35b0SAlan Cox 	if (result != KERN_SUCCESS) {
3276139043bSAlan Cox 		if (growstack && result == KERN_INVALID_ADDRESS &&
3282db65ab4SAlan Cox 		    map != kernel_map) {
3296139043bSAlan Cox 			result = vm_map_growstack(curproc, vaddr);
330a976eb5eSAlan Cox 			if (result != KERN_SUCCESS)
3316139043bSAlan Cox 				return (KERN_FAILURE);
3326139043bSAlan Cox 			growstack = FALSE;
3336139043bSAlan Cox 			goto RetryFault;
3346139043bSAlan Cox 		}
33592de35b0SAlan Cox 		return (result);
33609e0c6ccSJohn Dyson 	}
33709e0c6ccSJohn Dyson 
3384866e085SJohn Dyson 	map_generation = fs.map->timestamp;
3392d8acc0fSJohn Dyson 
3404866e085SJohn Dyson 	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
34147221757SJohn Dyson 		panic("vm_fault: fault on nofault entry, addr: %lx",
34292c4c4ebSBruce Evans 		    (u_long)vaddr);
3437aaaa4fdSJohn Dyson 	}
3447aaaa4fdSJohn Dyson 
3454f9c9114SKonstantin Belousov 	if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION &&
3464f9c9114SKonstantin Belousov 	    fs.entry->wiring_thread != curthread) {
3474f9c9114SKonstantin Belousov 		vm_map_unlock_read(fs.map);
3484f9c9114SKonstantin Belousov 		vm_map_lock(fs.map);
3494f9c9114SKonstantin Belousov 		if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) &&
3504f9c9114SKonstantin Belousov 		    (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
3514f9c9114SKonstantin Belousov 			fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3524f9c9114SKonstantin Belousov 			vm_map_unlock_and_wait(fs.map, 0);
3534f9c9114SKonstantin Belousov 		} else
3544f9c9114SKonstantin Belousov 			vm_map_unlock(fs.map);
3554f9c9114SKonstantin Belousov 		goto RetryFault;
3564f9c9114SKonstantin Belousov 	}
3574f9c9114SKonstantin Belousov 
358afe55ca3SKonstantin Belousov 	if (wired)
359afe55ca3SKonstantin Belousov 		fault_type = prot | (fault_type & VM_PROT_COPY);
360afe55ca3SKonstantin Belousov 
361afe55ca3SKonstantin Belousov 	if (fs.vp == NULL /* avoid locked vnode leak */ &&
362afe55ca3SKonstantin Belousov 	    (fault_flags & (VM_FAULT_CHANGE_WIRING | VM_FAULT_DIRTY)) == 0 &&
363afe55ca3SKonstantin Belousov 	    /* avoid calling vm_object_set_writeable_dirty() */
364afe55ca3SKonstantin Belousov 	    ((prot & VM_PROT_WRITE) == 0 ||
365f40cb1c6SKonstantin Belousov 	    (fs.first_object->type != OBJT_VNODE &&
366f40cb1c6SKonstantin Belousov 	    (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
367afe55ca3SKonstantin Belousov 	    (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0)) {
368afe55ca3SKonstantin Belousov 		VM_OBJECT_RLOCK(fs.first_object);
369afe55ca3SKonstantin Belousov 		if ((prot & VM_PROT_WRITE) != 0 &&
370f40cb1c6SKonstantin Belousov 		    (fs.first_object->type == OBJT_VNODE ||
371f40cb1c6SKonstantin Belousov 		    (fs.first_object->flags & OBJ_TMPFS_NODE) != 0) &&
372afe55ca3SKonstantin Belousov 		    (fs.first_object->flags & OBJ_MIGHTBEDIRTY) == 0)
373afe55ca3SKonstantin Belousov 			goto fast_failed;
374afe55ca3SKonstantin Belousov 		m = vm_page_lookup(fs.first_object, fs.first_pindex);
375b9ce8cc2SAlan Cox 		/* A busy page can be mapped for read|execute access. */
376b9ce8cc2SAlan Cox 		if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
377b9ce8cc2SAlan Cox 		    vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
378afe55ca3SKonstantin Belousov 			goto fast_failed;
379afe55ca3SKonstantin Belousov 		result = pmap_enter(fs.map->pmap, vaddr, m, prot,
380afe55ca3SKonstantin Belousov 		   fault_type | PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED :
381afe55ca3SKonstantin Belousov 		   0), 0);
382afe55ca3SKonstantin Belousov 		if (result != KERN_SUCCESS)
383afe55ca3SKonstantin Belousov 			goto fast_failed;
384afe55ca3SKonstantin Belousov 		if (m_hold != NULL) {
385afe55ca3SKonstantin Belousov 			*m_hold = m;
386afe55ca3SKonstantin Belousov 			vm_page_lock(m);
387afe55ca3SKonstantin Belousov 			vm_page_hold(m);
388afe55ca3SKonstantin Belousov 			vm_page_unlock(m);
389afe55ca3SKonstantin Belousov 		}
390a36f5532SKonstantin Belousov 		vm_fault_dirty(fs.entry, m, prot, fault_type, fault_flags,
391a36f5532SKonstantin Belousov 		    FALSE);
392afe55ca3SKonstantin Belousov 		VM_OBJECT_RUNLOCK(fs.first_object);
393afe55ca3SKonstantin Belousov 		if (!wired)
394afe55ca3SKonstantin Belousov 			vm_fault_prefault(&fs, vaddr, 0, 0);
395afe55ca3SKonstantin Belousov 		vm_map_lookup_done(fs.map, fs.entry);
396afe55ca3SKonstantin Belousov 		curthread->td_ru.ru_minflt++;
397afe55ca3SKonstantin Belousov 		return (KERN_SUCCESS);
398afe55ca3SKonstantin Belousov fast_failed:
399afe55ca3SKonstantin Belousov 		if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
400afe55ca3SKonstantin Belousov 			VM_OBJECT_RUNLOCK(fs.first_object);
401afe55ca3SKonstantin Belousov 			VM_OBJECT_WLOCK(fs.first_object);
402afe55ca3SKonstantin Belousov 		}
403afe55ca3SKonstantin Belousov 	} else {
404afe55ca3SKonstantin Belousov 		VM_OBJECT_WLOCK(fs.first_object);
405afe55ca3SKonstantin Belousov 	}
406afe55ca3SKonstantin Belousov 
40795e5e988SJohn Dyson 	/*
40895e5e988SJohn Dyson 	 * Make a reference to this object to prevent its disposal while we
40995e5e988SJohn Dyson 	 * are messing with it.  Once we have the reference, the map is free
41095e5e988SJohn Dyson 	 * to be diddled.  Since objects reference their shadows (and copies),
41195e5e988SJohn Dyson 	 * they will stay around as well.
412fe8e0238SMatthew Dillon 	 *
413fe8e0238SMatthew Dillon 	 * Bump the paging-in-progress count to prevent size changes (e.g.
414fe8e0238SMatthew Dillon 	 * truncation operations) during I/O.  This must be done after
415fe8e0238SMatthew Dillon 	 * obtaining the vnode lock in order to avoid possible deadlocks.
41695e5e988SJohn Dyson 	 */
417a976eb5eSAlan Cox 	vm_object_reference_locked(fs.first_object);
418d474eaaaSDoug Rabson 	vm_object_pip_add(fs.first_object, 1);
41995e5e988SJohn Dyson 
42025adb370SBrian Feldman 	fs.lookup_still_valid = TRUE;
421df8bae1dSRodney W. Grimes 
4224866e085SJohn Dyson 	fs.first_m = NULL;
423df8bae1dSRodney W. Grimes 
424df8bae1dSRodney W. Grimes 	/*
425df8bae1dSRodney W. Grimes 	 * Search for the page at object/offset.
426df8bae1dSRodney W. Grimes 	 */
4274866e085SJohn Dyson 	fs.object = fs.first_object;
4284866e085SJohn Dyson 	fs.pindex = fs.first_pindex;
429df8bae1dSRodney W. Grimes 	while (TRUE) {
4301c7c3c6aSMatthew Dillon 		/*
4311c7c3c6aSMatthew Dillon 		 * If the object is dead, we stop here
4321c7c3c6aSMatthew Dillon 		 */
4334866e085SJohn Dyson 		if (fs.object->flags & OBJ_DEAD) {
4344866e085SJohn Dyson 			unlock_and_deallocate(&fs);
43547221757SJohn Dyson 			return (KERN_PROTECTION_FAILURE);
43647221757SJohn Dyson 		}
43747221757SJohn Dyson 
4381c7c3c6aSMatthew Dillon 		/*
4391c7c3c6aSMatthew Dillon 		 * See if page is resident
4401c7c3c6aSMatthew Dillon 		 */
4414866e085SJohn Dyson 		fs.m = vm_page_lookup(fs.object, fs.pindex);
4424866e085SJohn Dyson 		if (fs.m != NULL) {
44398cb733cSKenneth D. Merry 			/*
4441c7c3c6aSMatthew Dillon 			 * Wait/Retry if the page is busy.  We have to do this
445c7aebda8SAttilio Rao 			 * if the page is either exclusive or shared busy
446c7aebda8SAttilio Rao 			 * because the vm_pager may be using read busy for
447c7aebda8SAttilio Rao 			 * pageouts (and even pageins if it is the vnode
448c7aebda8SAttilio Rao 			 * pager), and we could end up trying to pagein and
449c7aebda8SAttilio Rao 			 * pageout the same page simultaneously.
4501c7c3c6aSMatthew Dillon 			 *
4511c7c3c6aSMatthew Dillon 			 * We can theoretically allow the busy case on a read
4521c7c3c6aSMatthew Dillon 			 * fault if the page is marked valid, but since such
4531c7c3c6aSMatthew Dillon 			 * pages are typically already pmap'd, putting that
4541c7c3c6aSMatthew Dillon 			 * special case in might be more effort then it is
4551c7c3c6aSMatthew Dillon 			 * worth.  We cannot under any circumstances mess
456c7aebda8SAttilio Rao 			 * around with a shared busied page except, perhaps,
4571c7c3c6aSMatthew Dillon 			 * to pmap it.
458df8bae1dSRodney W. Grimes 			 */
459c7aebda8SAttilio Rao 			if (vm_page_busied(fs.m)) {
460b88b6c9dSAlan Cox 				/*
461b88b6c9dSAlan Cox 				 * Reference the page before unlocking and
462b88b6c9dSAlan Cox 				 * sleeping so that the page daemon is less
463b88b6c9dSAlan Cox 				 * likely to reclaim it.
464b88b6c9dSAlan Cox 				 */
4653407fefeSKonstantin Belousov 				vm_page_aflag_set(fs.m, PGA_REFERENCED);
466a51b0840SAlan Cox 				if (fs.object != fs.first_object) {
46789f6b863SAttilio Rao 					if (!VM_OBJECT_TRYWLOCK(
468a6e38685SKonstantin Belousov 					    fs.first_object)) {
46989f6b863SAttilio Rao 						VM_OBJECT_WUNLOCK(fs.object);
47089f6b863SAttilio Rao 						VM_OBJECT_WLOCK(fs.first_object);
47189f6b863SAttilio Rao 						VM_OBJECT_WLOCK(fs.object);
472a6e38685SKonstantin Belousov 					}
4732965a453SKip Macy 					vm_page_lock(fs.first_m);
474a51b0840SAlan Cox 					vm_page_free(fs.first_m);
4752965a453SKip Macy 					vm_page_unlock(fs.first_m);
476a51b0840SAlan Cox 					vm_object_pip_wakeup(fs.first_object);
47789f6b863SAttilio Rao 					VM_OBJECT_WUNLOCK(fs.first_object);
478a51b0840SAlan Cox 					fs.first_m = NULL;
479a51b0840SAlan Cox 				}
480a51b0840SAlan Cox 				unlock_map(&fs);
481a51b0840SAlan Cox 				if (fs.m == vm_page_lookup(fs.object,
482a51b0840SAlan Cox 				    fs.pindex)) {
483c7aebda8SAttilio Rao 					vm_page_sleep_if_busy(fs.m, "vmpfw");
484a51b0840SAlan Cox 				}
485a51b0840SAlan Cox 				vm_object_pip_wakeup(fs.object);
48689f6b863SAttilio Rao 				VM_OBJECT_WUNLOCK(fs.object);
48767596082SAttilio Rao 				PCPU_INC(cnt.v_intrans);
4884866e085SJohn Dyson 				vm_object_deallocate(fs.first_object);
489df8bae1dSRodney W. Grimes 				goto RetryFault;
490df8bae1dSRodney W. Grimes 			}
4913846a822SKonstantin Belousov 			vm_page_lock(fs.m);
4928d220203SAlan Cox 			vm_page_remque(fs.m);
4932965a453SKip Macy 			vm_page_unlock(fs.m);
4947615edaaSMatthew Dillon 
4951c7c3c6aSMatthew Dillon 			/*
4961c7c3c6aSMatthew Dillon 			 * Mark page busy for other processes, and the
4971c7c3c6aSMatthew Dillon 			 * pagedaemon.  If it still isn't completely valid
4981c7c3c6aSMatthew Dillon 			 * (readable), jump to readrest, else break-out ( we
4991c7c3c6aSMatthew Dillon 			 * found the page ).
5001c7c3c6aSMatthew Dillon 			 */
501c7aebda8SAttilio Rao 			vm_page_xbusy(fs.m);
502ff5958e7SAlan Cox 			if (fs.m->valid != VM_PAGE_BITS_ALL)
5030d94caffSDavid Greenman 				goto readrest;
504df8bae1dSRodney W. Grimes 			break;
505df8bae1dSRodney W. Grimes 		}
5061c7c3c6aSMatthew Dillon 
5071c7c3c6aSMatthew Dillon 		/*
50840360b1bSMatthew Dillon 		 * Page is not resident, If this is the search termination
50940360b1bSMatthew Dillon 		 * or the pager might contain the page, allocate a new page.
5101c7c3c6aSMatthew Dillon 		 */
51140360b1bSMatthew Dillon 		if (TRYPAGER || fs.object == fs.first_object) {
5124866e085SJohn Dyson 			if (fs.pindex >= fs.object->size) {
5134866e085SJohn Dyson 				unlock_and_deallocate(&fs);
5145f55e841SDavid Greenman 				return (KERN_PROTECTION_FAILURE);
5155f55e841SDavid Greenman 			}
51622ba64e8SJohn Dyson 
517df8bae1dSRodney W. Grimes 			/*
5180d94caffSDavid Greenman 			 * Allocate a new page for this object/offset pair.
5193f1c4c4fSKonstantin Belousov 			 *
5203f1c4c4fSKonstantin Belousov 			 * Unlocked read of the p_flag is harmless. At
5213f1c4c4fSKonstantin Belousov 			 * worst, the P_KILLED might be not observed
5223f1c4c4fSKonstantin Belousov 			 * there, and allocation can fail, causing
5233f1c4c4fSKonstantin Belousov 			 * restart and new reading of the p_flag.
524df8bae1dSRodney W. Grimes 			 */
52540360b1bSMatthew Dillon 			fs.m = NULL;
5263f1c4c4fSKonstantin Belousov 			if (!vm_page_count_severe() || P_KILLED(curproc)) {
527f8a47341SAlan Cox #if VM_NRESERVLEVEL > 0
5283d653db0SAlan Cox 				vm_object_color(fs.object, atop(vaddr) -
5293d653db0SAlan Cox 				    fs.pindex);
530f8a47341SAlan Cox #endif
5313f1c4c4fSKonstantin Belousov 				alloc_req = P_KILLED(curproc) ?
5323f1c4c4fSKonstantin Belousov 				    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
5333f1c4c4fSKonstantin Belousov 				if (fs.object->type != OBJT_VNODE &&
5343f1c4c4fSKonstantin Belousov 				    fs.object->backing_object == NULL)
5353f1c4c4fSKonstantin Belousov 					alloc_req |= VM_ALLOC_ZERO;
5364866e085SJohn Dyson 				fs.m = vm_page_alloc(fs.object, fs.pindex,
5373f1c4c4fSKonstantin Belousov 				    alloc_req);
53840360b1bSMatthew Dillon 			}
5394866e085SJohn Dyson 			if (fs.m == NULL) {
5404866e085SJohn Dyson 				unlock_and_deallocate(&fs);
541ef6020d1SMike Silbersack 				VM_WAITPFAULT;
542df8bae1dSRodney W. Grimes 				goto RetryFault;
5430a2e596aSAlan Cox 			} else if (fs.m->valid == VM_PAGE_BITS_ALL)
5444ab8ab92SKonstantin Belousov 				break;
545df8bae1dSRodney W. Grimes 		}
54647221757SJohn Dyson 
5470d94caffSDavid Greenman readrest:
5481c7c3c6aSMatthew Dillon 		/*
54940360b1bSMatthew Dillon 		 * We have found a valid page or we have allocated a new page.
55040360b1bSMatthew Dillon 		 * The page thus may not be valid or may not be entirely
55140360b1bSMatthew Dillon 		 * valid.
55240360b1bSMatthew Dillon 		 *
55340360b1bSMatthew Dillon 		 * Attempt to fault-in the page if there is a chance that the
55440360b1bSMatthew Dillon 		 * pager has it, and potentially fault in additional pages
55540360b1bSMatthew Dillon 		 * at the same time.
5561c7c3c6aSMatthew Dillon 		 */
55740360b1bSMatthew Dillon 		if (TRYPAGER) {
558df8bae1dSRodney W. Grimes 			int rv;
5597f866e4bSAlan Cox 			u_char behavior = vm_map_entry_behavior(fs.entry);
560867a482dSJohn Dyson 
5615268042bSAlan Cox 			era = fs.entry->read_ahead;
5623f1c4c4fSKonstantin Belousov 			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
5633f1c4c4fSKonstantin Belousov 			    P_KILLED(curproc)) {
564867a482dSJohn Dyson 				behind = 0;
5655268042bSAlan Cox 				nera = 0;
56613458803SAlan Cox 				ahead = 0;
56713458803SAlan Cox 			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
56813458803SAlan Cox 				behind = 0;
5695268042bSAlan Cox 				nera = VM_FAULT_READ_AHEAD_MAX;
5705268042bSAlan Cox 				ahead = nera;
57113458803SAlan Cox 				if (fs.pindex == fs.entry->next_read)
572a8b0f100SAlan Cox 					vm_fault_dontneed(&fs, vaddr, ahead);
5735268042bSAlan Cox 			} else if (fs.pindex == fs.entry->next_read) {
57413458803SAlan Cox 				/*
5755268042bSAlan Cox 				 * This is a sequential fault.  Arithmetically
5765268042bSAlan Cox 				 * increase the requested number of pages in
5775268042bSAlan Cox 				 * the read-ahead window.  The requested
5785268042bSAlan Cox 				 * number of pages is "# of sequential faults
5795268042bSAlan Cox 				 * x (read ahead min + 1) + read ahead min"
58013458803SAlan Cox 				 */
5815268042bSAlan Cox 				behind = 0;
5825268042bSAlan Cox 				nera = VM_FAULT_READ_AHEAD_MIN;
5835268042bSAlan Cox 				if (era > 0) {
5845268042bSAlan Cox 					nera += era + 1;
58513458803SAlan Cox 					if (nera > VM_FAULT_READ_AHEAD_MAX)
58613458803SAlan Cox 						nera = VM_FAULT_READ_AHEAD_MAX;
5875268042bSAlan Cox 				}
58813458803SAlan Cox 				ahead = nera;
58913458803SAlan Cox 				if (era == VM_FAULT_READ_AHEAD_MAX)
590a8b0f100SAlan Cox 					vm_fault_dontneed(&fs, vaddr, ahead);
5915268042bSAlan Cox 			} else {
5925268042bSAlan Cox 				/*
5935268042bSAlan Cox 				 * This is a non-sequential fault.  Request a
5945268042bSAlan Cox 				 * cluster of pages that is aligned to a
5955268042bSAlan Cox 				 * VM_FAULT_READ_DEFAULT page offset boundary
5965268042bSAlan Cox 				 * within the object.  Alignment to a page
5975268042bSAlan Cox 				 * offset boundary is more likely to coincide
5985268042bSAlan Cox 				 * with the underlying file system block than
5995268042bSAlan Cox 				 * alignment to a virtual address boundary.
6005268042bSAlan Cox 				 */
6015268042bSAlan Cox 				cluster_offset = fs.pindex %
6025268042bSAlan Cox 				    VM_FAULT_READ_DEFAULT;
6035268042bSAlan Cox 				behind = ulmin(cluster_offset,
6045268042bSAlan Cox 				    atop(vaddr - fs.entry->start));
6055268042bSAlan Cox 				nera = 0;
6065268042bSAlan Cox 				ahead = VM_FAULT_READ_DEFAULT - 1 -
6075268042bSAlan Cox 				    cluster_offset;
608867a482dSJohn Dyson 			}
6095268042bSAlan Cox 			ahead = ulmin(ahead, atop(fs.entry->end - vaddr) - 1);
6105268042bSAlan Cox 			if (era != nera)
6115268042bSAlan Cox 				fs.entry->read_ahead = nera;
612d2bf64c3SKonstantin Belousov 
613d2bf64c3SKonstantin Belousov 			/*
614d2bf64c3SKonstantin Belousov 			 * Call the pager to retrieve the data, if any, after
615d2bf64c3SKonstantin Belousov 			 * releasing the lock on the map.  We hold a ref on
616c7aebda8SAttilio Rao 			 * fs.object and the pages are exclusive busied.
617d2bf64c3SKonstantin Belousov 			 */
618d2bf64c3SKonstantin Belousov 			unlock_map(&fs);
619d2bf64c3SKonstantin Belousov 
620d2bf64c3SKonstantin Belousov 			if (fs.object->type == OBJT_VNODE) {
621d2bf64c3SKonstantin Belousov 				vp = fs.object->handle;
622d2bf64c3SKonstantin Belousov 				if (vp == fs.vp)
623d2bf64c3SKonstantin Belousov 					goto vnode_locked;
624d2bf64c3SKonstantin Belousov 				else if (fs.vp != NULL) {
625d2bf64c3SKonstantin Belousov 					vput(fs.vp);
626d2bf64c3SKonstantin Belousov 					fs.vp = NULL;
627d2bf64c3SKonstantin Belousov 				}
628d2bf64c3SKonstantin Belousov 				locked = VOP_ISLOCKED(vp);
629d2bf64c3SKonstantin Belousov 
630d2bf64c3SKonstantin Belousov 				if (locked != LK_EXCLUSIVE)
631d2bf64c3SKonstantin Belousov 					locked = LK_SHARED;
632d2bf64c3SKonstantin Belousov 				/* Do not sleep for vnode lock while fs.m is busy */
633d2bf64c3SKonstantin Belousov 				error = vget(vp, locked | LK_CANRECURSE |
634d2bf64c3SKonstantin Belousov 				    LK_NOWAIT, curthread);
635d2bf64c3SKonstantin Belousov 				if (error != 0) {
636d2bf64c3SKonstantin Belousov 					vhold(vp);
637d2bf64c3SKonstantin Belousov 					release_page(&fs);
638d2bf64c3SKonstantin Belousov 					unlock_and_deallocate(&fs);
639d2bf64c3SKonstantin Belousov 					error = vget(vp, locked | LK_RETRY |
640d2bf64c3SKonstantin Belousov 					    LK_CANRECURSE, curthread);
641d2bf64c3SKonstantin Belousov 					vdrop(vp);
642d2bf64c3SKonstantin Belousov 					fs.vp = vp;
643d2bf64c3SKonstantin Belousov 					KASSERT(error == 0,
644d2bf64c3SKonstantin Belousov 					    ("vm_fault: vget failed"));
645d2bf64c3SKonstantin Belousov 					goto RetryFault;
646d2bf64c3SKonstantin Belousov 				}
647d2bf64c3SKonstantin Belousov 				fs.vp = vp;
648d2bf64c3SKonstantin Belousov 			}
649d2bf64c3SKonstantin Belousov vnode_locked:
650d2bf64c3SKonstantin Belousov 			KASSERT(fs.vp == NULL || !fs.map->system_map,
651d2bf64c3SKonstantin Belousov 			    ("vm_fault: vnode-backed object mapped by system map"));
652d2bf64c3SKonstantin Belousov 
653df8bae1dSRodney W. Grimes 			/*
6540d94caffSDavid Greenman 			 * now we find out if any other pages should be paged
6550d94caffSDavid Greenman 			 * in at this time this routine checks to see if the
6560d94caffSDavid Greenman 			 * pages surrounding this fault reside in the same
6570d94caffSDavid Greenman 			 * object as the page for this fault.  If they do,
6580d94caffSDavid Greenman 			 * then they are faulted in also into the object.  The
6590d94caffSDavid Greenman 			 * array "marray" returned contains an array of
6600d94caffSDavid Greenman 			 * vm_page_t structs where one of them is the
6610d94caffSDavid Greenman 			 * vm_page_t passed to the routine.  The reqpage
6620d94caffSDavid Greenman 			 * return value is the index into the marray for the
6630d94caffSDavid Greenman 			 * vm_page_t passed to the routine.
6641c7c3c6aSMatthew Dillon 			 *
665c7aebda8SAttilio Rao 			 * fs.m plus the additional pages are exclusive busied.
66626f9a767SRodney W. Grimes 			 */
66705f0fdd2SPoul-Henning Kamp 			faultcount = vm_fault_additional_pages(
6684866e085SJohn Dyson 			    fs.m, behind, ahead, marray, &reqpage);
669df8bae1dSRodney W. Grimes 
67026f9a767SRodney W. Grimes 			rv = faultcount ?
6714866e085SJohn Dyson 			    vm_pager_get_pages(fs.object, marray, faultcount,
67224a1cce3SDavid Greenman 				reqpage) : VM_PAGER_FAIL;
67322ba64e8SJohn Dyson 
67426f9a767SRodney W. Grimes 			if (rv == VM_PAGER_OK) {
675df8bae1dSRodney W. Grimes 				/*
676f230c45cSJohn Dyson 				 * Found the page. Leave it busy while we play
677f230c45cSJohn Dyson 				 * with it.
678f230c45cSJohn Dyson 				 */
679f230c45cSJohn Dyson 
680f230c45cSJohn Dyson 				/*
6810d94caffSDavid Greenman 				 * Relookup in case pager changed page. Pager
6820d94caffSDavid Greenman 				 * is responsible for disposition of old page
6830d94caffSDavid Greenman 				 * if moved.
684df8bae1dSRodney W. Grimes 				 */
6854866e085SJohn Dyson 				fs.m = vm_page_lookup(fs.object, fs.pindex);
6864866e085SJohn Dyson 				if (!fs.m) {
6874866e085SJohn Dyson 					unlock_and_deallocate(&fs);
688f6b04d2bSDavid Greenman 					goto RetryFault;
689f6b04d2bSDavid Greenman 				}
690f6b04d2bSDavid Greenman 
69126f9a767SRodney W. Grimes 				hardfault++;
6921c7c3c6aSMatthew Dillon 				break; /* break to PAGE HAS BEEN FOUND */
693df8bae1dSRodney W. Grimes 			}
694df8bae1dSRodney W. Grimes 			/*
6950d94caffSDavid Greenman 			 * Remove the bogus page (which does not exist at this
6960d94caffSDavid Greenman 			 * object/offset); before doing so, we must get back
6970d94caffSDavid Greenman 			 * our object lock to preserve our invariant.
698df8bae1dSRodney W. Grimes 			 *
69924a1cce3SDavid Greenman 			 * Also wake up any other process that may want to bring
7000d94caffSDavid Greenman 			 * in this page.
701df8bae1dSRodney W. Grimes 			 *
7020d94caffSDavid Greenman 			 * If this is the top-level object, we must leave the
70324a1cce3SDavid Greenman 			 * busy page to prevent another process from rushing
7040d94caffSDavid Greenman 			 * past us, and inserting the page in that object at
7050d94caffSDavid Greenman 			 * the same time that we are.
706df8bae1dSRodney W. Grimes 			 */
707a83c285cSDavid Greenman 			if (rv == VM_PAGER_ERROR)
708f3679e35SDavid Greenman 				printf("vm_fault: pager read error, pid %d (%s)\n",
709f3679e35SDavid Greenman 				    curproc->p_pid, curproc->p_comm);
71026f9a767SRodney W. Grimes 			/*
711a83c285cSDavid Greenman 			 * Data outside the range of the pager or an I/O error
71226f9a767SRodney W. Grimes 			 */
713a83c285cSDavid Greenman 			/*
7140d94caffSDavid Greenman 			 * XXX - the check for kernel_map is a kludge to work
7150d94caffSDavid Greenman 			 * around having the machine panic on a kernel space
7160d94caffSDavid Greenman 			 * fault w/ I/O error.
717a83c285cSDavid Greenman 			 */
7184866e085SJohn Dyson 			if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) ||
71947221757SJohn Dyson 				(rv == VM_PAGER_BAD)) {
7202965a453SKip Macy 				vm_page_lock(fs.m);
7214866e085SJohn Dyson 				vm_page_free(fs.m);
7222965a453SKip Macy 				vm_page_unlock(fs.m);
7234866e085SJohn Dyson 				fs.m = NULL;
7244866e085SJohn Dyson 				unlock_and_deallocate(&fs);
725a83c285cSDavid Greenman 				return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE);
72626f9a767SRodney W. Grimes 			}
7274866e085SJohn Dyson 			if (fs.object != fs.first_object) {
7282965a453SKip Macy 				vm_page_lock(fs.m);
7294866e085SJohn Dyson 				vm_page_free(fs.m);
7302965a453SKip Macy 				vm_page_unlock(fs.m);
7314866e085SJohn Dyson 				fs.m = NULL;
73226f9a767SRodney W. Grimes 				/*
73326f9a767SRodney W. Grimes 				 * XXX - we cannot just fall out at this
73426f9a767SRodney W. Grimes 				 * point, m has been freed and is invalid!
73526f9a767SRodney W. Grimes 				 */
736df8bae1dSRodney W. Grimes 			}
737df8bae1dSRodney W. Grimes 		}
73840360b1bSMatthew Dillon 
739df8bae1dSRodney W. Grimes 		/*
7401c7c3c6aSMatthew Dillon 		 * We get here if the object has default pager (or unwiring)
7411c7c3c6aSMatthew Dillon 		 * or the pager doesn't have the page.
742df8bae1dSRodney W. Grimes 		 */
7434866e085SJohn Dyson 		if (fs.object == fs.first_object)
7444866e085SJohn Dyson 			fs.first_m = fs.m;
745df8bae1dSRodney W. Grimes 
746df8bae1dSRodney W. Grimes 		/*
7470d94caffSDavid Greenman 		 * Move on to the next object.  Lock the next object before
7480d94caffSDavid Greenman 		 * unlocking the current one.
749df8bae1dSRodney W. Grimes 		 */
7504866e085SJohn Dyson 		fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset);
7514866e085SJohn Dyson 		next_object = fs.object->backing_object;
752df8bae1dSRodney W. Grimes 		if (next_object == NULL) {
753df8bae1dSRodney W. Grimes 			/*
7540d94caffSDavid Greenman 			 * If there's no object left, fill the page in the top
7550d94caffSDavid Greenman 			 * object with zeros.
756df8bae1dSRodney W. Grimes 			 */
7574866e085SJohn Dyson 			if (fs.object != fs.first_object) {
7584866e085SJohn Dyson 				vm_object_pip_wakeup(fs.object);
75989f6b863SAttilio Rao 				VM_OBJECT_WUNLOCK(fs.object);
760df8bae1dSRodney W. Grimes 
7614866e085SJohn Dyson 				fs.object = fs.first_object;
7624866e085SJohn Dyson 				fs.pindex = fs.first_pindex;
7634866e085SJohn Dyson 				fs.m = fs.first_m;
76489f6b863SAttilio Rao 				VM_OBJECT_WLOCK(fs.object);
765df8bae1dSRodney W. Grimes 			}
7664866e085SJohn Dyson 			fs.first_m = NULL;
767df8bae1dSRodney W. Grimes 
7684221e284SAlan Cox 			/*
7694221e284SAlan Cox 			 * Zero the page if necessary and mark it valid.
7704221e284SAlan Cox 			 */
7714866e085SJohn Dyson 			if ((fs.m->flags & PG_ZERO) == 0) {
772fff6062aSAlan Cox 				pmap_zero_page(fs.m);
7734221e284SAlan Cox 			} else {
77467596082SAttilio Rao 				PCPU_INC(cnt.v_ozfod);
7754221e284SAlan Cox 			}
77667596082SAttilio Rao 			PCPU_INC(cnt.v_zfod);
7774221e284SAlan Cox 			fs.m->valid = VM_PAGE_BITS_ALL;
7787b9b301cSAlan Cox 			/* Don't try to prefault neighboring pages. */
7797b9b301cSAlan Cox 			faultcount = 1;
7801c7c3c6aSMatthew Dillon 			break;	/* break to PAGE HAS BEEN FOUND */
7810d94caffSDavid Greenman 		} else {
782c8567c3aSAlan Cox 			KASSERT(fs.object != next_object,
783c8567c3aSAlan Cox 			    ("object loop %p", next_object));
78489f6b863SAttilio Rao 			VM_OBJECT_WLOCK(next_object);
785c8567c3aSAlan Cox 			vm_object_pip_add(next_object, 1);
786c8567c3aSAlan Cox 			if (fs.object != fs.first_object)
7874866e085SJohn Dyson 				vm_object_pip_wakeup(fs.object);
78889f6b863SAttilio Rao 			VM_OBJECT_WUNLOCK(fs.object);
7894866e085SJohn Dyson 			fs.object = next_object;
790df8bae1dSRodney W. Grimes 		}
791df8bae1dSRodney W. Grimes 	}
7921c7c3c6aSMatthew Dillon 
793c7aebda8SAttilio Rao 	vm_page_assert_xbusied(fs.m);
794df8bae1dSRodney W. Grimes 
795df8bae1dSRodney W. Grimes 	/*
7960d94caffSDavid Greenman 	 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
797df8bae1dSRodney W. Grimes 	 * is held.]
798df8bae1dSRodney W. Grimes 	 */
799df8bae1dSRodney W. Grimes 
800df8bae1dSRodney W. Grimes 	/*
8010d94caffSDavid Greenman 	 * If the page is being written, but isn't already owned by the
8020d94caffSDavid Greenman 	 * top-level object, we have to copy it into a new page owned by the
8030d94caffSDavid Greenman 	 * top-level object.
804df8bae1dSRodney W. Grimes 	 */
8054866e085SJohn Dyson 	if (fs.object != fs.first_object) {
806df8bae1dSRodney W. Grimes 		/*
8070d94caffSDavid Greenman 		 * We only really need to copy if we want to write it.
808df8bae1dSRodney W. Grimes 		 */
809a6d42a0dSAlan Cox 		if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
810df8bae1dSRodney W. Grimes 			/*
8111c7c3c6aSMatthew Dillon 			 * This allows pages to be virtually copied from a
8121c7c3c6aSMatthew Dillon 			 * backing_object into the first_object, where the
8131c7c3c6aSMatthew Dillon 			 * backing object has no other refs to it, and cannot
8141c7c3c6aSMatthew Dillon 			 * gain any more refs.  Instead of a bcopy, we just
8151c7c3c6aSMatthew Dillon 			 * move the page from the backing object to the
8161c7c3c6aSMatthew Dillon 			 * first object.  Note that we must mark the page
8171c7c3c6aSMatthew Dillon 			 * dirty in the first object so that it will go out
8181c7c3c6aSMatthew Dillon 			 * to swap when needed.
819df8bae1dSRodney W. Grimes 			 */
820ebf75125SAlan Cox 			is_first_object_locked = FALSE;
821e50346b5SAlan Cox 			if (
822de5f6a77SJohn Dyson 				/*
823de5f6a77SJohn Dyson 				 * Only one shadow object
824de5f6a77SJohn Dyson 				 */
8254866e085SJohn Dyson 				(fs.object->shadow_count == 1) &&
826de5f6a77SJohn Dyson 				/*
827de5f6a77SJohn Dyson 				 * No COW refs, except us
828de5f6a77SJohn Dyson 				 */
8294866e085SJohn Dyson 				(fs.object->ref_count == 1) &&
830de5f6a77SJohn Dyson 				/*
8315929bcfaSPhilippe Charnier 				 * No one else can look this object up
832de5f6a77SJohn Dyson 				 */
8334866e085SJohn Dyson 				(fs.object->handle == NULL) &&
834de5f6a77SJohn Dyson 				/*
835de5f6a77SJohn Dyson 				 * No other ways to look the object up
836de5f6a77SJohn Dyson 				 */
8374866e085SJohn Dyson 				((fs.object->type == OBJT_DEFAULT) ||
8384866e085SJohn Dyson 				 (fs.object->type == OBJT_SWAP)) &&
83989f6b863SAttilio Rao 			    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
840de5f6a77SJohn Dyson 				/*
841de5f6a77SJohn Dyson 				 * We don't chase down the shadow chain
842de5f6a77SJohn Dyson 				 */
843e50346b5SAlan Cox 			    fs.object == fs.first_object->backing_object) {
8442d8acc0fSJohn Dyson 				/*
845de5f6a77SJohn Dyson 				 * get rid of the unnecessary page
846df8bae1dSRodney W. Grimes 				 */
847eb00b276SAlan Cox 				vm_page_lock(fs.first_m);
8484866e085SJohn Dyson 				vm_page_free(fs.first_m);
8492965a453SKip Macy 				vm_page_unlock(fs.first_m);
850de5f6a77SJohn Dyson 				/*
8511c7c3c6aSMatthew Dillon 				 * grab the page and put it into the
8521c7c3c6aSMatthew Dillon 				 * process'es object.  The page is
8531c7c3c6aSMatthew Dillon 				 * automatically made dirty.
854de5f6a77SJohn Dyson 				 */
855e946b949SAttilio Rao 				if (vm_page_rename(fs.m, fs.first_object,
856e946b949SAttilio Rao 				    fs.first_pindex)) {
857e946b949SAttilio Rao 					unlock_and_deallocate(&fs);
858e946b949SAttilio Rao 					goto RetryFault;
859e946b949SAttilio Rao 				}
860dfdf9abdSAlan Cox #if VM_NRESERVLEVEL > 0
861dfdf9abdSAlan Cox 				/*
862dfdf9abdSAlan Cox 				 * Rename the reservation.
863dfdf9abdSAlan Cox 				 */
864dfdf9abdSAlan Cox 				vm_reserv_rename(fs.m, fs.first_object,
865dfdf9abdSAlan Cox 				    fs.object, OFF_TO_IDX(
866dfdf9abdSAlan Cox 				    fs.first_object->backing_object_offset));
867dfdf9abdSAlan Cox #endif
868c7aebda8SAttilio Rao 				vm_page_xbusy(fs.m);
869d98ddc46SAlan Cox 				fs.first_m = fs.m;
8704866e085SJohn Dyson 				fs.m = NULL;
87167596082SAttilio Rao 				PCPU_INC(cnt.v_cow_optim);
872de5f6a77SJohn Dyson 			} else {
873de5f6a77SJohn Dyson 				/*
874de5f6a77SJohn Dyson 				 * Oh, well, lets copy it.
875de5f6a77SJohn Dyson 				 */
876669890eaSAlan Cox 				pmap_copy_page(fs.m, fs.first_m);
877669890eaSAlan Cox 				fs.first_m->valid = VM_PAGE_BITS_ALL;
878d8778512SAlan Cox 				if (wired && (fault_flags &
879d8778512SAlan Cox 				    VM_FAULT_CHANGE_WIRING) == 0) {
8802965a453SKip Macy 					vm_page_lock(fs.first_m);
881d8778512SAlan Cox 					vm_page_wire(fs.first_m);
8822965a453SKip Macy 					vm_page_unlock(fs.first_m);
8832965a453SKip Macy 
8842965a453SKip Macy 					vm_page_lock(fs.m);
8853ae10f74SAttilio Rao 					vm_page_unwire(fs.m, PQ_INACTIVE);
8862965a453SKip Macy 					vm_page_unlock(fs.m);
887de5f6a77SJohn Dyson 				}
888df8bae1dSRodney W. Grimes 				/*
889df8bae1dSRodney W. Grimes 				 * We no longer need the old page or object.
890df8bae1dSRodney W. Grimes 				 */
8914866e085SJohn Dyson 				release_page(&fs);
892de5f6a77SJohn Dyson 			}
8931c7c3c6aSMatthew Dillon 			/*
8941c7c3c6aSMatthew Dillon 			 * fs.object != fs.first_object due to above
8951c7c3c6aSMatthew Dillon 			 * conditional
8961c7c3c6aSMatthew Dillon 			 */
8974866e085SJohn Dyson 			vm_object_pip_wakeup(fs.object);
89889f6b863SAttilio Rao 			VM_OBJECT_WUNLOCK(fs.object);
899df8bae1dSRodney W. Grimes 			/*
900df8bae1dSRodney W. Grimes 			 * Only use the new page below...
901df8bae1dSRodney W. Grimes 			 */
9024866e085SJohn Dyson 			fs.object = fs.first_object;
9034866e085SJohn Dyson 			fs.pindex = fs.first_pindex;
904d98ddc46SAlan Cox 			fs.m = fs.first_m;
905f29ba63eSAlan Cox 			if (!is_first_object_locked)
90689f6b863SAttilio Rao 				VM_OBJECT_WLOCK(fs.object);
90767596082SAttilio Rao 			PCPU_INC(cnt.v_cow_faults);
9084d34e019SKonstantin Belousov 			curthread->td_cow++;
9090d94caffSDavid Greenman 		} else {
910df8bae1dSRodney W. Grimes 			prot &= ~VM_PROT_WRITE;
911df8bae1dSRodney W. Grimes 		}
912df8bae1dSRodney W. Grimes 	}
913df8bae1dSRodney W. Grimes 
914df8bae1dSRodney W. Grimes 	/*
9150d94caffSDavid Greenman 	 * We must verify that the maps have not changed since our last
9160d94caffSDavid Greenman 	 * lookup.
917df8bae1dSRodney W. Grimes 	 */
91819dc5607STor Egge 	if (!fs.lookup_still_valid) {
919df8bae1dSRodney W. Grimes 		vm_object_t retry_object;
920a316d390SJohn Dyson 		vm_pindex_t retry_pindex;
921df8bae1dSRodney W. Grimes 		vm_prot_t retry_prot;
922df8bae1dSRodney W. Grimes 
92319dc5607STor Egge 		if (!vm_map_trylock_read(fs.map)) {
924b823bbd6SMatthew Dillon 			release_page(&fs);
925b823bbd6SMatthew Dillon 			unlock_and_deallocate(&fs);
926b823bbd6SMatthew Dillon 			goto RetryFault;
927b823bbd6SMatthew Dillon 		}
92819dc5607STor Egge 		fs.lookup_still_valid = TRUE;
92919dc5607STor Egge 		if (fs.map->timestamp != map_generation) {
93019dc5607STor Egge 			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
9314866e085SJohn Dyson 			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
932df8bae1dSRodney W. Grimes 
933df8bae1dSRodney W. Grimes 			/*
93444ed3417STor Egge 			 * If we don't need the page any longer, put it on the inactive
9350d94caffSDavid Greenman 			 * list (the easiest thing to do here).  If no one needs it,
9360d94caffSDavid Greenman 			 * pageout will grab it eventually.
937df8bae1dSRodney W. Grimes 			 */
938df8bae1dSRodney W. Grimes 			if (result != KERN_SUCCESS) {
9394866e085SJohn Dyson 				release_page(&fs);
9404866e085SJohn Dyson 				unlock_and_deallocate(&fs);
94119dc5607STor Egge 
94219dc5607STor Egge 				/*
94319dc5607STor Egge 				 * If retry of map lookup would have blocked then
94419dc5607STor Egge 				 * retry fault from start.
94519dc5607STor Egge 				 */
94619dc5607STor Egge 				if (result == KERN_FAILURE)
94719dc5607STor Egge 					goto RetryFault;
948df8bae1dSRodney W. Grimes 				return (result);
949df8bae1dSRodney W. Grimes 			}
9504866e085SJohn Dyson 			if ((retry_object != fs.first_object) ||
9514866e085SJohn Dyson 			    (retry_pindex != fs.first_pindex)) {
9524866e085SJohn Dyson 				release_page(&fs);
9534866e085SJohn Dyson 				unlock_and_deallocate(&fs);
954df8bae1dSRodney W. Grimes 				goto RetryFault;
955df8bae1dSRodney W. Grimes 			}
95619dc5607STor Egge 
957df8bae1dSRodney W. Grimes 			/*
9580d94caffSDavid Greenman 			 * Check whether the protection has changed or the object has
9590d94caffSDavid Greenman 			 * been copied while we left the map unlocked. Changing from
9600d94caffSDavid Greenman 			 * read to write permission is OK - we leave the page
9610d94caffSDavid Greenman 			 * write-protected, and catch the write fault. Changing from
9620d94caffSDavid Greenman 			 * write to read permission means that we can't mark the page
9630d94caffSDavid Greenman 			 * write-enabled after all.
964df8bae1dSRodney W. Grimes 			 */
965df8bae1dSRodney W. Grimes 			prot &= retry_prot;
966df8bae1dSRodney W. Grimes 		}
96719dc5607STor Egge 	}
968d2bf64c3SKonstantin Belousov 	/*
9695758fe71SAlan Cox 	 * If the page was filled by a pager, update the map entry's
9705758fe71SAlan Cox 	 * last read offset.  Since the pager does not return the
9715758fe71SAlan Cox 	 * actual set of pages that it read, this update is based on
9725758fe71SAlan Cox 	 * the requested set.  Typically, the requested and actual
9735758fe71SAlan Cox 	 * sets are the same.
974d2bf64c3SKonstantin Belousov 	 *
975d2bf64c3SKonstantin Belousov 	 * XXX The following assignment modifies the map
976d2bf64c3SKonstantin Belousov 	 * without holding a write lock on it.
977d2bf64c3SKonstantin Belousov 	 */
9785758fe71SAlan Cox 	if (hardfault)
97913458803SAlan Cox 		fs.entry->next_read = fs.pindex + faultcount - reqpage;
980d2bf64c3SKonstantin Belousov 
981a36f5532SKonstantin Belousov 	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, TRUE);
982c7aebda8SAttilio Rao 	vm_page_assert_xbusied(fs.m);
983c7aebda8SAttilio Rao 
9844221e284SAlan Cox 	/*
98578cfe1f7SAlan Cox 	 * Page must be completely valid or it is not fit to
9864221e284SAlan Cox 	 * map into user space.  vm_pager_get_pages() ensures this.
9874221e284SAlan Cox 	 */
98878cfe1f7SAlan Cox 	KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
98978cfe1f7SAlan Cox 	    ("vm_fault: page %p partially invalid", fs.m));
99089f6b863SAttilio Rao 	VM_OBJECT_WUNLOCK(fs.object);
991cbfbaad8SAlan Cox 
99286735996SAlan Cox 	/*
99386735996SAlan Cox 	 * Put this page into the physical map.  We had to do the unlock above
99486735996SAlan Cox 	 * because pmap_enter() may sleep.  We don't put the page
99586735996SAlan Cox 	 * back on the active queue until later so that the pageout daemon
99686735996SAlan Cox 	 * won't find it (yet).
99786735996SAlan Cox 	 */
99839ffa8c1SKonstantin Belousov 	pmap_enter(fs.map->pmap, vaddr, fs.m, prot,
99939ffa8c1SKonstantin Belousov 	    fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
10007b9b301cSAlan Cox 	if (faultcount != 1 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0 &&
10017b9b301cSAlan Cox 	    wired == 0)
100263281952SAlan Cox 		vm_fault_prefault(&fs, vaddr, faultcount, reqpage);
100389f6b863SAttilio Rao 	VM_OBJECT_WLOCK(fs.object);
10042965a453SKip Macy 	vm_page_lock(fs.m);
1005ff97964aSJohn Dyson 
1006df8bae1dSRodney W. Grimes 	/*
10070d94caffSDavid Greenman 	 * If the page is not wired down, then put it where the pageout daemon
10080d94caffSDavid Greenman 	 * can find it.
1009df8bae1dSRodney W. Grimes 	 */
10102db65ab4SAlan Cox 	if (fault_flags & VM_FAULT_CHANGE_WIRING) {
1011df8bae1dSRodney W. Grimes 		if (wired)
10124866e085SJohn Dyson 			vm_page_wire(fs.m);
1013df8bae1dSRodney W. Grimes 		else
10143ae10f74SAttilio Rao 			vm_page_unwire(fs.m, PQ_ACTIVE);
101503679e23SAlan Cox 	} else
10164866e085SJohn Dyson 		vm_page_activate(fs.m);
1017acd11c74SAlan Cox 	if (m_hold != NULL) {
1018acd11c74SAlan Cox 		*m_hold = fs.m;
1019acd11c74SAlan Cox 		vm_page_hold(fs.m);
1020acd11c74SAlan Cox 	}
10212965a453SKip Macy 	vm_page_unlock(fs.m);
1022c7aebda8SAttilio Rao 	vm_page_xunbusy(fs.m);
1023eeec6babSJohn Baldwin 
1024eebf3286SAlan Cox 	/*
1025eebf3286SAlan Cox 	 * Unlock everything, and return
1026eebf3286SAlan Cox 	 */
1027eebf3286SAlan Cox 	unlock_and_deallocate(&fs);
1028b3a01bdfSAndrey Zonov 	if (hardfault) {
1029b3a01bdfSAndrey Zonov 		PCPU_INC(cnt.v_io_faults);
10301c4bcd05SJeff Roberson 		curthread->td_ru.ru_majflt++;
1031b3a01bdfSAndrey Zonov 	} else
10321c4bcd05SJeff Roberson 		curthread->td_ru.ru_minflt++;
1033df8bae1dSRodney W. Grimes 
1034df8bae1dSRodney W. Grimes 	return (KERN_SUCCESS);
1035df8bae1dSRodney W. Grimes }
1036df8bae1dSRodney W. Grimes 
1037df8bae1dSRodney W. Grimes /*
1038a8b0f100SAlan Cox  * Speed up the reclamation of pages that precede the faulting pindex within
1039a8b0f100SAlan Cox  * the first object of the shadow chain.  Essentially, perform the equivalent
1040a8b0f100SAlan Cox  * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
1041a8b0f100SAlan Cox  * the faulting pindex by the cluster size when the pages read by vm_fault()
1042a8b0f100SAlan Cox  * cross a cluster-size boundary.  The cluster size is the greater of the
1043a8b0f100SAlan Cox  * smallest superpage size and VM_FAULT_DONTNEED_MIN.
1044a8b0f100SAlan Cox  *
1045a8b0f100SAlan Cox  * When "fs->first_object" is a shadow object, the pages in the backing object
1046a8b0f100SAlan Cox  * that precede the faulting pindex are deactivated by vm_fault().  So, this
1047a8b0f100SAlan Cox  * function must only be concerned with pages in the first object.
104813458803SAlan Cox  */
104913458803SAlan Cox static void
1050a8b0f100SAlan Cox vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
105113458803SAlan Cox {
1052a8b0f100SAlan Cox 	vm_map_entry_t entry;
105313458803SAlan Cox 	vm_object_t first_object, object;
1054a8b0f100SAlan Cox 	vm_offset_t end, start;
1055a8b0f100SAlan Cox 	vm_page_t m, m_next;
1056a8b0f100SAlan Cox 	vm_pindex_t pend, pstart;
1057a8b0f100SAlan Cox 	vm_size_t size;
105813458803SAlan Cox 
105913458803SAlan Cox 	object = fs->object;
106089f6b863SAttilio Rao 	VM_OBJECT_ASSERT_WLOCKED(object);
106113458803SAlan Cox 	first_object = fs->first_object;
106213458803SAlan Cox 	if (first_object != object) {
1063*b5ab20c0SAlan Cox 		if (!VM_OBJECT_TRYWLOCK(first_object)) {
106489f6b863SAttilio Rao 			VM_OBJECT_WUNLOCK(object);
1065*b5ab20c0SAlan Cox 			VM_OBJECT_WLOCK(first_object);
106689f6b863SAttilio Rao 			VM_OBJECT_WLOCK(object);
106713458803SAlan Cox 		}
106813458803SAlan Cox 	}
1069a8b0f100SAlan Cox 	/* Neither fictitious nor unmanaged pages can be reclaimed. */
107028634820SAlan Cox 	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
1071a8b0f100SAlan Cox 		size = VM_FAULT_DONTNEED_MIN;
1072a8b0f100SAlan Cox 		if (MAXPAGESIZES > 1 && size < pagesizes[1])
1073a8b0f100SAlan Cox 			size = pagesizes[1];
1074a8b0f100SAlan Cox 		end = rounddown2(vaddr, size);
1075a8b0f100SAlan Cox 		if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
1076a8b0f100SAlan Cox 		    (entry = fs->entry)->start < end) {
1077a8b0f100SAlan Cox 			if (end - entry->start < size)
1078a8b0f100SAlan Cox 				start = entry->start;
107913458803SAlan Cox 			else
1080a8b0f100SAlan Cox 				start = end - size;
1081a8b0f100SAlan Cox 			pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
1082a8b0f100SAlan Cox 			pstart = OFF_TO_IDX(entry->offset) + atop(start -
1083a8b0f100SAlan Cox 			    entry->start);
1084a8b0f100SAlan Cox 			m_next = vm_page_find_least(first_object, pstart);
1085a8b0f100SAlan Cox 			pend = OFF_TO_IDX(entry->offset) + atop(end -
1086a8b0f100SAlan Cox 			    entry->start);
1087a8b0f100SAlan Cox 			while ((m = m_next) != NULL && m->pindex < pend) {
1088a8b0f100SAlan Cox 				m_next = TAILQ_NEXT(m, listq);
1089a8b0f100SAlan Cox 				if (m->valid != VM_PAGE_BITS_ALL ||
1090a8b0f100SAlan Cox 				    vm_page_busied(m))
109113458803SAlan Cox 					continue;
109213458803SAlan Cox 				vm_page_lock(m);
1093a8b0f100SAlan Cox 				if (m->hold_count == 0 && m->wire_count == 0)
1094a8b0f100SAlan Cox 					vm_page_advise(m, MADV_DONTNEED);
109513458803SAlan Cox 				vm_page_unlock(m);
109613458803SAlan Cox 			}
109713458803SAlan Cox 		}
1098a8b0f100SAlan Cox 	}
109913458803SAlan Cox 	if (first_object != object)
1100*b5ab20c0SAlan Cox 		VM_OBJECT_WUNLOCK(first_object);
110113458803SAlan Cox }
110213458803SAlan Cox 
110313458803SAlan Cox /*
1104566526a9SAlan Cox  * vm_fault_prefault provides a quick way of clustering
1105566526a9SAlan Cox  * pagefaults into a processes address space.  It is a "cousin"
1106566526a9SAlan Cox  * of vm_map_pmap_enter, except it runs at page fault time instead
1107566526a9SAlan Cox  * of mmap time.
1108566526a9SAlan Cox  */
1109566526a9SAlan Cox static void
111063281952SAlan Cox vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
111163281952SAlan Cox     int faultcount, int reqpage)
1112566526a9SAlan Cox {
111363281952SAlan Cox 	pmap_t pmap;
111463281952SAlan Cox 	vm_map_entry_t entry;
111563281952SAlan Cox 	vm_object_t backing_object, lobject;
1116566526a9SAlan Cox 	vm_offset_t addr, starta;
1117566526a9SAlan Cox 	vm_pindex_t pindex;
11182053c127SStephan Uphoff 	vm_page_t m;
111963281952SAlan Cox 	int backward, forward, i;
1120566526a9SAlan Cox 
112163281952SAlan Cox 	pmap = fs->map->pmap;
1122950d5f7aSAlan Cox 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
1123566526a9SAlan Cox 		return;
1124566526a9SAlan Cox 
112563281952SAlan Cox 	if (faultcount > 0) {
112663281952SAlan Cox 		backward = reqpage;
112763281952SAlan Cox 		forward = faultcount - reqpage - 1;
112863281952SAlan Cox 	} else {
112963281952SAlan Cox 		backward = PFBAK;
113063281952SAlan Cox 		forward = PFFOR;
113163281952SAlan Cox 	}
113263281952SAlan Cox 	entry = fs->entry;
1133566526a9SAlan Cox 
113463281952SAlan Cox 	starta = addra - backward * PAGE_SIZE;
1135566526a9SAlan Cox 	if (starta < entry->start) {
1136566526a9SAlan Cox 		starta = entry->start;
1137566526a9SAlan Cox 	} else if (starta > addra) {
1138566526a9SAlan Cox 		starta = 0;
1139566526a9SAlan Cox 	}
1140566526a9SAlan Cox 
114163281952SAlan Cox 	/*
114263281952SAlan Cox 	 * Generate the sequence of virtual addresses that are candidates for
114363281952SAlan Cox 	 * prefaulting in an outward spiral from the faulting virtual address,
114463281952SAlan Cox 	 * "addra".  Specifically, the sequence is "addra - PAGE_SIZE", "addra
114563281952SAlan Cox 	 * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
114663281952SAlan Cox 	 * If the candidate address doesn't have a backing physical page, then
114763281952SAlan Cox 	 * the loop immediately terminates.
114863281952SAlan Cox 	 */
114963281952SAlan Cox 	for (i = 0; i < 2 * imax(backward, forward); i++) {
115063281952SAlan Cox 		addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
115163281952SAlan Cox 		    PAGE_SIZE);
115263281952SAlan Cox 		if (addr > addra + forward * PAGE_SIZE)
1153566526a9SAlan Cox 			addr = 0;
1154566526a9SAlan Cox 
1155566526a9SAlan Cox 		if (addr < starta || addr >= entry->end)
1156566526a9SAlan Cox 			continue;
1157566526a9SAlan Cox 
1158566526a9SAlan Cox 		if (!pmap_is_prefaultable(pmap, addr))
1159566526a9SAlan Cox 			continue;
1160566526a9SAlan Cox 
1161566526a9SAlan Cox 		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
116263281952SAlan Cox 		lobject = entry->object.vm_object;
1163c141ae7fSAlan Cox 		VM_OBJECT_RLOCK(lobject);
1164566526a9SAlan Cox 		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
1165566526a9SAlan Cox 		    lobject->type == OBJT_DEFAULT &&
1166566526a9SAlan Cox 		    (backing_object = lobject->backing_object) != NULL) {
116736930fc9SAlan Cox 			KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
116836930fc9SAlan Cox 			    0, ("vm_fault_prefault: unaligned object offset"));
1169566526a9SAlan Cox 			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
1170c141ae7fSAlan Cox 			VM_OBJECT_RLOCK(backing_object);
1171c141ae7fSAlan Cox 			VM_OBJECT_RUNLOCK(lobject);
1172566526a9SAlan Cox 			lobject = backing_object;
1173566526a9SAlan Cox 		}
1174cbfbaad8SAlan Cox 		if (m == NULL) {
1175c141ae7fSAlan Cox 			VM_OBJECT_RUNLOCK(lobject);
1176566526a9SAlan Cox 			break;
1177cbfbaad8SAlan Cox 		}
11780a2e596aSAlan Cox 		if (m->valid == VM_PAGE_BITS_ALL &&
11793c4a2440SAlan Cox 		    (m->flags & PG_FICTITIOUS) == 0)
11807bfda801SAlan Cox 			pmap_enter_quick(pmap, addr, m, entry->protection);
1181c141ae7fSAlan Cox 		VM_OBJECT_RUNLOCK(lobject);
1182566526a9SAlan Cox 	}
1183566526a9SAlan Cox }
1184566526a9SAlan Cox 
1185566526a9SAlan Cox /*
118682de724fSAlan Cox  * Hold each of the physical pages that are mapped by the specified range of
118782de724fSAlan Cox  * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
118882de724fSAlan Cox  * and allow the specified types of access, "prot".  If all of the implied
118982de724fSAlan Cox  * pages are successfully held, then the number of held pages is returned
119082de724fSAlan Cox  * together with pointers to those pages in the array "ma".  However, if any
119182de724fSAlan Cox  * of the pages cannot be held, -1 is returned.
119282de724fSAlan Cox  */
119382de724fSAlan Cox int
119482de724fSAlan Cox vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
119582de724fSAlan Cox     vm_prot_t prot, vm_page_t *ma, int max_count)
119682de724fSAlan Cox {
119782de724fSAlan Cox 	vm_offset_t end, va;
119882de724fSAlan Cox 	vm_page_t *mp;
11997e14088dSKonstantin Belousov 	int count;
120082de724fSAlan Cox 	boolean_t pmap_failed;
120182de724fSAlan Cox 
1202af32c419SKonstantin Belousov 	if (len == 0)
1203af32c419SKonstantin Belousov 		return (0);
120482de724fSAlan Cox 	end = round_page(addr + len);
120582de724fSAlan Cox 	addr = trunc_page(addr);
120682de724fSAlan Cox 
120782de724fSAlan Cox 	/*
120882de724fSAlan Cox 	 * Check for illegal addresses.
120982de724fSAlan Cox 	 */
121082de724fSAlan Cox 	if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map))
121182de724fSAlan Cox 		return (-1);
121282de724fSAlan Cox 
12137e14088dSKonstantin Belousov 	if (atop(end - addr) > max_count)
121482de724fSAlan Cox 		panic("vm_fault_quick_hold_pages: count > max_count");
12157e14088dSKonstantin Belousov 	count = atop(end - addr);
121682de724fSAlan Cox 
121782de724fSAlan Cox 	/*
121882de724fSAlan Cox 	 * Most likely, the physical pages are resident in the pmap, so it is
121982de724fSAlan Cox 	 * faster to try pmap_extract_and_hold() first.
122082de724fSAlan Cox 	 */
122182de724fSAlan Cox 	pmap_failed = FALSE;
122282de724fSAlan Cox 	for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) {
122382de724fSAlan Cox 		*mp = pmap_extract_and_hold(map->pmap, va, prot);
122482de724fSAlan Cox 		if (*mp == NULL)
122582de724fSAlan Cox 			pmap_failed = TRUE;
122682de724fSAlan Cox 		else if ((prot & VM_PROT_WRITE) != 0 &&
1227a5dbab54SAlan Cox 		    (*mp)->dirty != VM_PAGE_BITS_ALL) {
122882de724fSAlan Cox 			/*
122982de724fSAlan Cox 			 * Explicitly dirty the physical page.  Otherwise, the
123082de724fSAlan Cox 			 * caller's changes may go unnoticed because they are
123182de724fSAlan Cox 			 * performed through an unmanaged mapping or by a DMA
123282de724fSAlan Cox 			 * operation.
12333c76db4cSAlan Cox 			 *
1234abb9b935SKonstantin Belousov 			 * The object lock is not held here.
1235abb9b935SKonstantin Belousov 			 * See vm_page_clear_dirty_mask().
123682de724fSAlan Cox 			 */
12373c76db4cSAlan Cox 			vm_page_dirty(*mp);
123882de724fSAlan Cox 		}
123982de724fSAlan Cox 	}
124082de724fSAlan Cox 	if (pmap_failed) {
124182de724fSAlan Cox 		/*
124282de724fSAlan Cox 		 * One or more pages could not be held by the pmap.  Either no
124382de724fSAlan Cox 		 * page was mapped at the specified virtual address or that
124482de724fSAlan Cox 		 * mapping had insufficient permissions.  Attempt to fault in
124582de724fSAlan Cox 		 * and hold these pages.
124682de724fSAlan Cox 		 */
124782de724fSAlan Cox 		for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
1248be996836SAttilio Rao 			if (*mp == NULL && vm_fault_hold(map, va, prot,
124982de724fSAlan Cox 			    VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
125082de724fSAlan Cox 				goto error;
125182de724fSAlan Cox 	}
125282de724fSAlan Cox 	return (count);
125382de724fSAlan Cox error:
125482de724fSAlan Cox 	for (mp = ma; mp < ma + count; mp++)
125582de724fSAlan Cox 		if (*mp != NULL) {
125682de724fSAlan Cox 			vm_page_lock(*mp);
125782de724fSAlan Cox 			vm_page_unhold(*mp);
125882de724fSAlan Cox 			vm_page_unlock(*mp);
125982de724fSAlan Cox 		}
126082de724fSAlan Cox 	return (-1);
126182de724fSAlan Cox }
126282de724fSAlan Cox 
126382de724fSAlan Cox /*
1264df8bae1dSRodney W. Grimes  *	Routine:
1265df8bae1dSRodney W. Grimes  *		vm_fault_copy_entry
1266df8bae1dSRodney W. Grimes  *	Function:
1267210a6886SKonstantin Belousov  *		Create new shadow object backing dst_entry with private copy of
1268210a6886SKonstantin Belousov  *		all underlying pages. When src_entry is equal to dst_entry,
1269210a6886SKonstantin Belousov  *		function implements COW for wired-down map entry. Otherwise,
1270210a6886SKonstantin Belousov  *		it forks wired entry into dst_map.
1271df8bae1dSRodney W. Grimes  *
1272df8bae1dSRodney W. Grimes  *	In/out conditions:
1273df8bae1dSRodney W. Grimes  *		The source and destination maps must be locked for write.
1274df8bae1dSRodney W. Grimes  *		The source map entry must be wired down (or be a sharing map
1275df8bae1dSRodney W. Grimes  *		entry corresponding to a main map entry that is wired down).
1276df8bae1dSRodney W. Grimes  */
127726f9a767SRodney W. Grimes void
1278121fd461SKonstantin Belousov vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
1279121fd461SKonstantin Belousov     vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
1280121fd461SKonstantin Belousov     vm_ooffset_t *fork_charge)
1281df8bae1dSRodney W. Grimes {
1282210a6886SKonstantin Belousov 	vm_object_t backing_object, dst_object, object, src_object;
12837afab86cSAlan Cox 	vm_pindex_t dst_pindex, pindex, src_pindex;
1284210a6886SKonstantin Belousov 	vm_prot_t access, prot;
1285df8bae1dSRodney W. Grimes 	vm_offset_t vaddr;
1286df8bae1dSRodney W. Grimes 	vm_page_t dst_m;
1287df8bae1dSRodney W. Grimes 	vm_page_t src_m;
12884c74acf7SKonstantin Belousov 	boolean_t upgrade;
1289df8bae1dSRodney W. Grimes 
1290df8bae1dSRodney W. Grimes #ifdef	lint
1291df8bae1dSRodney W. Grimes 	src_map++;
12920d94caffSDavid Greenman #endif	/* lint */
1293df8bae1dSRodney W. Grimes 
1294210a6886SKonstantin Belousov 	upgrade = src_entry == dst_entry;
12950973283dSKonstantin Belousov 	access = prot = dst_entry->protection;
1296210a6886SKonstantin Belousov 
1297df8bae1dSRodney W. Grimes 	src_object = src_entry->object.vm_object;
12987afab86cSAlan Cox 	src_pindex = OFF_TO_IDX(src_entry->offset);
1299df8bae1dSRodney W. Grimes 
13000973283dSKonstantin Belousov 	if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
13010973283dSKonstantin Belousov 		dst_object = src_object;
13020973283dSKonstantin Belousov 		vm_object_reference(dst_object);
13030973283dSKonstantin Belousov 	} else {
1304df8bae1dSRodney W. Grimes 		/*
13050d94caffSDavid Greenman 		 * Create the top-level object for the destination entry. (Doesn't
13060d94caffSDavid Greenman 		 * actually shadow anything - we copy the pages directly.)
1307df8bae1dSRodney W. Grimes 		 */
130824a1cce3SDavid Greenman 		dst_object = vm_object_allocate(OBJT_DEFAULT,
130957b5187bSAlan Cox 		    OFF_TO_IDX(dst_entry->end - dst_entry->start));
1310f8a47341SAlan Cox #if VM_NRESERVLEVEL > 0
1311f8a47341SAlan Cox 		dst_object->flags |= OBJ_COLORED;
1312f8a47341SAlan Cox 		dst_object->pg_color = atop(dst_entry->start);
1313f8a47341SAlan Cox #endif
13140973283dSKonstantin Belousov 	}
1315df8bae1dSRodney W. Grimes 
131689f6b863SAttilio Rao 	VM_OBJECT_WLOCK(dst_object);
1317210a6886SKonstantin Belousov 	KASSERT(upgrade || dst_entry->object.vm_object == NULL,
1318121fd461SKonstantin Belousov 	    ("vm_fault_copy_entry: vm_object not NULL"));
13190973283dSKonstantin Belousov 	if (src_object != dst_object) {
1320df8bae1dSRodney W. Grimes 		dst_entry->object.vm_object = dst_object;
1321df8bae1dSRodney W. Grimes 		dst_entry->offset = 0;
13223364c323SKonstantin Belousov 		dst_object->charge = dst_entry->end - dst_entry->start;
13230973283dSKonstantin Belousov 	}
1324210a6886SKonstantin Belousov 	if (fork_charge != NULL) {
1325ef694c1aSEdward Tomasz Napierala 		KASSERT(dst_entry->cred == NULL,
1326121fd461SKonstantin Belousov 		    ("vm_fault_copy_entry: leaked swp charge"));
1327ef694c1aSEdward Tomasz Napierala 		dst_object->cred = curthread->td_ucred;
1328ef694c1aSEdward Tomasz Napierala 		crhold(dst_object->cred);
1329121fd461SKonstantin Belousov 		*fork_charge += dst_object->charge;
13300973283dSKonstantin Belousov 	} else if (dst_object->cred == NULL) {
13310973283dSKonstantin Belousov 		KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
13320973283dSKonstantin Belousov 		    dst_entry));
1333ef694c1aSEdward Tomasz Napierala 		dst_object->cred = dst_entry->cred;
1334ef694c1aSEdward Tomasz Napierala 		dst_entry->cred = NULL;
1335210a6886SKonstantin Belousov 	}
13360973283dSKonstantin Belousov 
1337210a6886SKonstantin Belousov 	/*
1338210a6886SKonstantin Belousov 	 * If not an upgrade, then enter the mappings in the pmap as
1339210a6886SKonstantin Belousov 	 * read and/or execute accesses.  Otherwise, enter them as
1340210a6886SKonstantin Belousov 	 * write accesses.
1341210a6886SKonstantin Belousov 	 *
1342210a6886SKonstantin Belousov 	 * A writeable large page mapping is only created if all of
1343210a6886SKonstantin Belousov 	 * the constituent small page mappings are modified. Marking
1344210a6886SKonstantin Belousov 	 * PTEs as modified on inception allows promotion to happen
1345210a6886SKonstantin Belousov 	 * without taking potentially large number of soft faults.
1346210a6886SKonstantin Belousov 	 */
1347210a6886SKonstantin Belousov 	if (!upgrade)
1348210a6886SKonstantin Belousov 		access &= ~VM_PROT_WRITE;
1349df8bae1dSRodney W. Grimes 
1350df8bae1dSRodney W. Grimes 	/*
1351ef45823eSKonstantin Belousov 	 * Loop through all of the virtual pages within the entry's
1352ef45823eSKonstantin Belousov 	 * range, copying each page from the source object to the
1353ef45823eSKonstantin Belousov 	 * destination object.  Since the source is wired, those pages
1354ef45823eSKonstantin Belousov 	 * must exist.  In contrast, the destination is pageable.
1355ef45823eSKonstantin Belousov 	 * Since the destination object does share any backing storage
1356ef45823eSKonstantin Belousov 	 * with the source object, all of its pages must be dirtied,
1357ef45823eSKonstantin Belousov 	 * regardless of whether they can be written.
1358df8bae1dSRodney W. Grimes 	 */
13597afab86cSAlan Cox 	for (vaddr = dst_entry->start, dst_pindex = 0;
1360df8bae1dSRodney W. Grimes 	    vaddr < dst_entry->end;
13617afab86cSAlan Cox 	    vaddr += PAGE_SIZE, dst_pindex++) {
13620973283dSKonstantin Belousov again:
1363df8bae1dSRodney W. Grimes 		/*
1364df8bae1dSRodney W. Grimes 		 * Find the page in the source object, and copy it in.
13654c74acf7SKonstantin Belousov 		 * Because the source is wired down, the page will be
13664c74acf7SKonstantin Belousov 		 * in memory.
1367df8bae1dSRodney W. Grimes 		 */
13680973283dSKonstantin Belousov 		if (src_object != dst_object)
136983b375eaSAttilio Rao 			VM_OBJECT_RLOCK(src_object);
1370c5b65a67SAlan Cox 		object = src_object;
13717afab86cSAlan Cox 		pindex = src_pindex + dst_pindex;
13727afab86cSAlan Cox 		while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
1373c5b65a67SAlan Cox 		    (backing_object = object->backing_object) != NULL) {
1374c5b65a67SAlan Cox 			/*
13754c74acf7SKonstantin Belousov 			 * Unless the source mapping is read-only or
13764c74acf7SKonstantin Belousov 			 * it is presently being upgraded from
13774c74acf7SKonstantin Belousov 			 * read-only, the first object in the shadow
13784c74acf7SKonstantin Belousov 			 * chain should provide all of the pages.  In
13794c74acf7SKonstantin Belousov 			 * other words, this loop body should never be
13804c74acf7SKonstantin Belousov 			 * executed when the source mapping is already
13814c74acf7SKonstantin Belousov 			 * read/write.
1382c5b65a67SAlan Cox 			 */
13834c74acf7SKonstantin Belousov 			KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
13844c74acf7SKonstantin Belousov 			    upgrade,
13854c74acf7SKonstantin Belousov 			    ("vm_fault_copy_entry: main object missing page"));
13864c74acf7SKonstantin Belousov 
138783b375eaSAttilio Rao 			VM_OBJECT_RLOCK(backing_object);
1388c5b65a67SAlan Cox 			pindex += OFF_TO_IDX(object->backing_object_offset);
13890973283dSKonstantin Belousov 			if (object != dst_object)
139083b375eaSAttilio Rao 				VM_OBJECT_RUNLOCK(object);
1391c5b65a67SAlan Cox 			object = backing_object;
1392c5b65a67SAlan Cox 		}
13934c74acf7SKonstantin Belousov 		KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
13940973283dSKonstantin Belousov 
13950973283dSKonstantin Belousov 		if (object != dst_object) {
13960973283dSKonstantin Belousov 			/*
13970973283dSKonstantin Belousov 			 * Allocate a page in the destination object.
13980973283dSKonstantin Belousov 			 */
13992602a2eaSKonstantin Belousov 			dst_m = vm_page_alloc(dst_object, (src_object ==
14002602a2eaSKonstantin Belousov 			    dst_object ? src_pindex : 0) + dst_pindex,
14012602a2eaSKonstantin Belousov 			    VM_ALLOC_NORMAL);
14020973283dSKonstantin Belousov 			if (dst_m == NULL) {
14030973283dSKonstantin Belousov 				VM_OBJECT_WUNLOCK(dst_object);
14040973283dSKonstantin Belousov 				VM_OBJECT_RUNLOCK(object);
14050973283dSKonstantin Belousov 				VM_WAIT;
1406c8f780e3SKonstantin Belousov 				VM_OBJECT_WLOCK(dst_object);
14070973283dSKonstantin Belousov 				goto again;
14080973283dSKonstantin Belousov 			}
1409669890eaSAlan Cox 			pmap_copy_page(src_m, dst_m);
141083b375eaSAttilio Rao 			VM_OBJECT_RUNLOCK(object);
1411669890eaSAlan Cox 			dst_m->valid = VM_PAGE_BITS_ALL;
1412bc79b37fSKonstantin Belousov 			dst_m->dirty = VM_PAGE_BITS_ALL;
14130973283dSKonstantin Belousov 		} else {
14140973283dSKonstantin Belousov 			dst_m = src_m;
14150973283dSKonstantin Belousov 			if (vm_page_sleep_if_busy(dst_m, "fltupg"))
14160973283dSKonstantin Belousov 				goto again;
14170973283dSKonstantin Belousov 			vm_page_xbusy(dst_m);
14180973283dSKonstantin Belousov 			KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
14190973283dSKonstantin Belousov 			    ("invalid dst page %p", dst_m));
14200973283dSKonstantin Belousov 		}
142189f6b863SAttilio Rao 		VM_OBJECT_WUNLOCK(dst_object);
1422df8bae1dSRodney W. Grimes 
1423df8bae1dSRodney W. Grimes 		/*
1424210a6886SKonstantin Belousov 		 * Enter it in the pmap. If a wired, copy-on-write
1425210a6886SKonstantin Belousov 		 * mapping is being replaced by a write-enabled
1426210a6886SKonstantin Belousov 		 * mapping, then wire that new mapping.
1427df8bae1dSRodney W. Grimes 		 */
142839ffa8c1SKonstantin Belousov 		pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
142939ffa8c1SKonstantin Belousov 		    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
1430df8bae1dSRodney W. Grimes 
1431df8bae1dSRodney W. Grimes 		/*
1432df8bae1dSRodney W. Grimes 		 * Mark it no longer busy, and put it on the active list.
1433df8bae1dSRodney W. Grimes 		 */
143489f6b863SAttilio Rao 		VM_OBJECT_WLOCK(dst_object);
14352965a453SKip Macy 
1436210a6886SKonstantin Belousov 		if (upgrade) {
14370973283dSKonstantin Belousov 			if (src_m != dst_m) {
14382965a453SKip Macy 				vm_page_lock(src_m);
14393ae10f74SAttilio Rao 				vm_page_unwire(src_m, PQ_INACTIVE);
1440e20e8c15SKonstantin Belousov 				vm_page_unlock(src_m);
14412965a453SKip Macy 				vm_page_lock(dst_m);
1442210a6886SKonstantin Belousov 				vm_page_wire(dst_m);
1443e20e8c15SKonstantin Belousov 				vm_page_unlock(dst_m);
14442965a453SKip Macy 			} else {
14450973283dSKonstantin Belousov 				KASSERT(dst_m->wire_count > 0,
14460973283dSKonstantin Belousov 				    ("dst_m %p is not wired", dst_m));
14470973283dSKonstantin Belousov 			}
14480973283dSKonstantin Belousov 		} else {
14492965a453SKip Macy 			vm_page_lock(dst_m);
1450df8bae1dSRodney W. Grimes 			vm_page_activate(dst_m);
1451e20e8c15SKonstantin Belousov 			vm_page_unlock(dst_m);
14522965a453SKip Macy 		}
1453c7aebda8SAttilio Rao 		vm_page_xunbusy(dst_m);
1454df8bae1dSRodney W. Grimes 	}
145589f6b863SAttilio Rao 	VM_OBJECT_WUNLOCK(dst_object);
1456210a6886SKonstantin Belousov 	if (upgrade) {
1457210a6886SKonstantin Belousov 		dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
1458210a6886SKonstantin Belousov 		vm_object_deallocate(src_object);
1459210a6886SKonstantin Belousov 	}
1460df8bae1dSRodney W. Grimes }
146126f9a767SRodney W. Grimes 
146226f9a767SRodney W. Grimes 
146326f9a767SRodney W. Grimes /*
146426f9a767SRodney W. Grimes  * This routine checks around the requested page for other pages that
146522ba64e8SJohn Dyson  * might be able to be faulted in.  This routine brackets the viable
146622ba64e8SJohn Dyson  * pages for the pages to be paged in.
146726f9a767SRodney W. Grimes  *
146826f9a767SRodney W. Grimes  * Inputs:
146922ba64e8SJohn Dyson  *	m, rbehind, rahead
147026f9a767SRodney W. Grimes  *
147126f9a767SRodney W. Grimes  * Outputs:
147226f9a767SRodney W. Grimes  *  marray (array of vm_page_t), reqpage (index of requested page)
147326f9a767SRodney W. Grimes  *
147426f9a767SRodney W. Grimes  * Return value:
147526f9a767SRodney W. Grimes  *  number of pages in marray
147626f9a767SRodney W. Grimes  */
1477303b270bSEivind Eklund static int
147822ba64e8SJohn Dyson vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
147926f9a767SRodney W. Grimes 	vm_page_t m;
148026f9a767SRodney W. Grimes 	int rbehind;
148122ba64e8SJohn Dyson 	int rahead;
148226f9a767SRodney W. Grimes 	vm_page_t *marray;
148326f9a767SRodney W. Grimes 	int *reqpage;
148426f9a767SRodney W. Grimes {
14852d8acc0fSJohn Dyson 	int i,j;
148626f9a767SRodney W. Grimes 	vm_object_t object;
1487a316d390SJohn Dyson 	vm_pindex_t pindex, startpindex, endpindex, tpindex;
148826f9a767SRodney W. Grimes 	vm_page_t rtm;
1489170db9c6SJohn Dyson 	int cbehind, cahead;
149026f9a767SRodney W. Grimes 
149189f6b863SAttilio Rao 	VM_OBJECT_ASSERT_WLOCKED(m->object);
149223955314SAlfred Perlstein 
149326f9a767SRodney W. Grimes 	object = m->object;
1494a316d390SJohn Dyson 	pindex = m->pindex;
1495fcdd9721SPawel Jakub Dawidek 	cbehind = cahead = 0;
1496fcdd9721SPawel Jakub Dawidek 
1497f35329acSJohn Dyson 	/*
149826f9a767SRodney W. Grimes 	 * if the requested page is not available, then give up now
149926f9a767SRodney W. Grimes 	 */
15001c7c3c6aSMatthew Dillon 	if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
150126f9a767SRodney W. Grimes 		return 0;
15022d8acc0fSJohn Dyson 	}
150326f9a767SRodney W. Grimes 
150422ba64e8SJohn Dyson 	if ((cbehind == 0) && (cahead == 0)) {
150522ba64e8SJohn Dyson 		*reqpage = 0;
150622ba64e8SJohn Dyson 		marray[0] = m;
150722ba64e8SJohn Dyson 		return 1;
1508170db9c6SJohn Dyson 	}
150922ba64e8SJohn Dyson 
151022ba64e8SJohn Dyson 	if (rahead > cahead) {
151122ba64e8SJohn Dyson 		rahead = cahead;
151222ba64e8SJohn Dyson 	}
151322ba64e8SJohn Dyson 
1514170db9c6SJohn Dyson 	if (rbehind > cbehind) {
1515170db9c6SJohn Dyson 		rbehind = cbehind;
1516170db9c6SJohn Dyson 	}
1517170db9c6SJohn Dyson 
151826f9a767SRodney W. Grimes 	/*
15192d8acc0fSJohn Dyson 	 * scan backward for the read behind pages -- in memory
152026f9a767SRodney W. Grimes 	 */
15212d8acc0fSJohn Dyson 	if (pindex > 0) {
15222d8acc0fSJohn Dyson 		if (rbehind > pindex) {
1523a316d390SJohn Dyson 			rbehind = pindex;
15242d8acc0fSJohn Dyson 			startpindex = 0;
15252d8acc0fSJohn Dyson 		} else {
1526a316d390SJohn Dyson 			startpindex = pindex - rbehind;
15272d8acc0fSJohn Dyson 		}
15282d8acc0fSJohn Dyson 
15298f8790a7SAlan Cox 		if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL &&
15308f8790a7SAlan Cox 		    rtm->pindex >= startpindex)
15318f8790a7SAlan Cox 			startpindex = rtm->pindex + 1;
153226f9a767SRodney W. Grimes 
153380645364SAlan Cox 		/* tpindex is unsigned; beware of numeric underflow. */
153480645364SAlan Cox 		for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
153580645364SAlan Cox 		    tpindex < pindex; i++, tpindex--) {
153626f9a767SRodney W. Grimes 
15377bfda801SAlan Cox 			rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
15387bfda801SAlan Cox 			    VM_ALLOC_IFNOTCACHED);
1539ccbb2f72SJohn Dyson 			if (rtm == NULL) {
154080645364SAlan Cox 				/*
154180645364SAlan Cox 				 * Shift the allocated pages to the
154280645364SAlan Cox 				 * beginning of the array.
154380645364SAlan Cox 				 */
1544ccbb2f72SJohn Dyson 				for (j = 0; j < i; j++) {
154580645364SAlan Cox 					marray[j] = marray[j + tpindex + 1 -
154680645364SAlan Cox 					    startpindex];
154726f9a767SRodney W. Grimes 				}
154880645364SAlan Cox 				break;
154926f9a767SRodney W. Grimes 			}
1550170db9c6SJohn Dyson 
155180645364SAlan Cox 			marray[tpindex - startpindex] = rtm;
155226f9a767SRodney W. Grimes 		}
15532d8acc0fSJohn Dyson 	} else {
15542d8acc0fSJohn Dyson 		startpindex = 0;
15552d8acc0fSJohn Dyson 		i = 0;
15562d8acc0fSJohn Dyson 	}
15572d8acc0fSJohn Dyson 
15582d8acc0fSJohn Dyson 	marray[i] = m;
15592d8acc0fSJohn Dyson 	/* page offset of the required page */
15602d8acc0fSJohn Dyson 	*reqpage = i;
15612d8acc0fSJohn Dyson 
15622d8acc0fSJohn Dyson 	tpindex = pindex + 1;
15632d8acc0fSJohn Dyson 	i++;
15642d8acc0fSJohn Dyson 
15652d8acc0fSJohn Dyson 	/*
15662d8acc0fSJohn Dyson 	 * scan forward for the read ahead pages
15672d8acc0fSJohn Dyson 	 */
15682d8acc0fSJohn Dyson 	endpindex = tpindex + rahead;
15698f8790a7SAlan Cox 	if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex)
15708f8790a7SAlan Cox 		endpindex = rtm->pindex;
15712d8acc0fSJohn Dyson 	if (endpindex > object->size)
15722d8acc0fSJohn Dyson 		endpindex = object->size;
15732d8acc0fSJohn Dyson 
15742d8acc0fSJohn Dyson 	for (; tpindex < endpindex; i++, tpindex++) {
15752d8acc0fSJohn Dyson 
15767bfda801SAlan Cox 		rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
15777bfda801SAlan Cox 		    VM_ALLOC_IFNOTCACHED);
15782d8acc0fSJohn Dyson 		if (rtm == NULL) {
15792d8acc0fSJohn Dyson 			break;
15802d8acc0fSJohn Dyson 		}
15812d8acc0fSJohn Dyson 
15822d8acc0fSJohn Dyson 		marray[i] = rtm;
15832d8acc0fSJohn Dyson 	}
15842d8acc0fSJohn Dyson 
158565ea29a6SAlan Cox 	/* return number of pages */
15862d8acc0fSJohn Dyson 	return i;
158726f9a767SRodney W. Grimes }
15882801687dSKonstantin Belousov 
15895730afc9SAlan Cox /*
15905730afc9SAlan Cox  * Block entry into the machine-independent layer's page fault handler by
15915730afc9SAlan Cox  * the calling thread.  Subsequent calls to vm_fault() by that thread will
15925730afc9SAlan Cox  * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of
15935730afc9SAlan Cox  * spurious page faults.
15945730afc9SAlan Cox  */
15952801687dSKonstantin Belousov int
15962801687dSKonstantin Belousov vm_fault_disable_pagefaults(void)
15972801687dSKonstantin Belousov {
15982801687dSKonstantin Belousov 
15995730afc9SAlan Cox 	return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
16002801687dSKonstantin Belousov }
16012801687dSKonstantin Belousov 
16022801687dSKonstantin Belousov void
16032801687dSKonstantin Belousov vm_fault_enable_pagefaults(int save)
16042801687dSKonstantin Belousov {
16052801687dSKonstantin Belousov 
16062801687dSKonstantin Belousov 	curthread_pflags_restore(save);
16072801687dSKonstantin Belousov }
1608