/*-
 * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	Page fault handling module.
 */

#include <sys/cdefs.h>
#include "opt_ktrace.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/pctrie.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_reserv.h>

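/*
 * Prefault and read-ahead tuning.  PFBAK and PFFOR below bound the cluster
 * that vm_fault_prefault() maps in around the faulting address (see the
 * vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, ...) call sites later in this
 * file), VM_FAULT_READ_DEFAULT is the initial number of pages requested from
 * the pager (the faulting page plus the initial read-ahead window), and
 * VM_FAULT_DONTNEED_MIN is, roughly, the minimum number of bytes that must
 * lie behind a sequential scan before vm_fault_dontneed() starts dropping
 * the pages left behind.
 */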
#define	PFBAK 4
#define	PFFOR 4

#define	VM_FAULT_READ_DEFAULT	(1 + VM_FAULT_READ_AHEAD_INIT)

#define	VM_FAULT_DONTNEED_MIN	1048576

struct faultstate {
	/* Fault parameters. */
	vm_offset_t	vaddr;
	vm_page_t	*m_hold;
	vm_prot_t	fault_type;
	vm_prot_t	prot;
	int		fault_flags;
	boolean_t	wired;

	/* Control state. */
	struct timeval	oom_start_time;
	bool		oom_started;
	int		nera;
	bool		can_read_lock;

	/* Page reference for cow. */
	vm_page_t	m_cow;

	/* Current object. */
	vm_object_t	object;
	vm_pindex_t	pindex;
	vm_page_t	m;

	/* Top-level map object. */
	vm_object_t	first_object;
	vm_pindex_t	first_pindex;
	vm_page_t	first_m;

	/* Map state. */
	vm_map_t	map;
	vm_map_entry_t	entry;
	int		map_generation;
	bool		lookup_still_valid;

	/* Vnode if locked. */
	struct vnode	*vp;
};

/*
 * Return codes for internal fault routines.
 */
enum fault_status {
	FAULT_SUCCESS = 10000,	/* Return success to user. */
	FAULT_FAILURE,		/* Return failure to user. */
	FAULT_CONTINUE,		/* Continue faulting. */
	FAULT_RESTART,		/* Restart fault. */
	FAULT_OUT_OF_BOUNDS,	/* Invalid address for pager. */
	FAULT_HARD,		/* Performed I/O. */
	FAULT_SOFT,		/* Found valid page. */
	FAULT_PROTECTION_FAILURE, /* Invalid access. */
};
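/*
 * The FAULT_* constants deliberately start at a large base value (10000),
 * apparently so that an internal status accidentally returned where a Mach
 * KERN_* code is expected stands out immediately.  Roughly speaking,
 * FAULT_SUCCESS, FAULT_SOFT and FAULT_HARD end up as KERN_SUCCESS,
 * FAULT_FAILURE, FAULT_OUT_OF_BOUNDS and FAULT_PROTECTION_FAILURE map onto
 * the correspondingly named KERN_* errors, and FAULT_RESTART/FAULT_CONTINUE
 * never escape the fault handler itself.  (Approximate summary only; the
 * authoritative translation lives in vm_fault().)
 */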

enum fault_next_status {
	FAULT_NEXT_GOTOBJ = 1,
	FAULT_NEXT_NOOBJ,
	FAULT_NEXT_RESTART,
};

static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
	    int ahead);
static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
	    int backward, int forward, bool obj_locked);

static int vm_pfault_oom_attempts = 3;
SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
    &vm_pfault_oom_attempts, 0,
    "Number of page allocation attempts in page fault handler before it "
    "triggers OOM handling");

static int vm_pfault_oom_wait = 10;
SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
    &vm_pfault_oom_wait, 0,
    "Number of seconds to wait for free pages before retrying "
    "the page fault handler");

static inline void
vm_fault_page_release(vm_page_t *mp)
{
	vm_page_t m;

	m = *mp;
	if (m != NULL) {
		/*
		 * We are likely to loop around again and attempt to busy
		 * this page.  Deactivating it leaves it available for
		 * pageout while optimizing fault restarts.
		 */
		vm_page_deactivate(m);
		vm_page_xunbusy(m);
		*mp = NULL;
	}
}

static inline void
vm_fault_page_free(vm_page_t *mp)
{
	vm_page_t m;

	m = *mp;
	if (m != NULL) {
		VM_OBJECT_ASSERT_WLOCKED(m->object);
		if (!vm_page_wired(m))
			vm_page_free(m);
		else
			vm_page_xunbusy(m);
		*mp = NULL;
	}
}

/*
 * Return true if a vm_pager_get_pages() call is needed in order to check
 * whether the pager might have a particular page, false if it can be determined
 * immediately that the pager can not have a copy.  For swap objects, this can
 * be checked quickly.
 */
static inline bool
vm_fault_object_needs_getpages(vm_object_t object)
{
	VM_OBJECT_ASSERT_LOCKED(object);

	return ((object->flags & OBJ_SWAP) == 0 ||
	    !pctrie_is_empty(&object->un_pager.swp.swp_blks));
}

static inline void
vm_fault_unlock_map(struct faultstate *fs)
{

	if (fs->lookup_still_valid) {
		vm_map_lookup_done(fs->map, fs->entry);
		fs->lookup_still_valid = false;
	}
}

static void
vm_fault_unlock_vp(struct faultstate *fs)
{

	if (fs->vp != NULL) {
		vput(fs->vp);
		fs->vp = NULL;
	}
}

static void
vm_fault_deallocate(struct faultstate *fs)
{

	vm_fault_page_release(&fs->m_cow);
	vm_fault_page_release(&fs->m);
	vm_object_pip_wakeup(fs->object);
	if (fs->object != fs->first_object) {
		VM_OBJECT_WLOCK(fs->first_object);
		vm_fault_page_free(&fs->first_m);
		VM_OBJECT_WUNLOCK(fs->first_object);
		vm_object_pip_wakeup(fs->first_object);
	}
	vm_object_deallocate(fs->first_object);
	vm_fault_unlock_map(fs);
	vm_fault_unlock_vp(fs);
}

static void
vm_fault_unlock_and_deallocate(struct faultstate *fs)
{

	VM_OBJECT_UNLOCK(fs->object);
	vm_fault_deallocate(fs);
}

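/*
 * The helpers above form the common teardown path of the fault handler:
 * vm_fault_page_release() and vm_fault_page_free() retire a single page,
 * vm_fault_deallocate() releases everything else the faultstate holds (the
 * COW and current pages, the paging-in-progress references, the top-level
 * object reference, the map lookup and any held vnode) and expects the
 * object lock to have been dropped already, while
 * vm_fault_unlock_and_deallocate() drops fs->object's lock first and then
 * calls vm_fault_deallocate().  A schematic bail-out in the code below is:
 *
 *	vm_fault_unlock_and_deallocate(fs);
 *	return (FAULT_RESTART);
 *
 * (illustrative shape only; see the actual call sites in this file).
 */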
static void
vm_fault_dirty(struct faultstate *fs, vm_page_t m)
{
	bool need_dirty;

	if (((fs->prot & VM_PROT_WRITE) == 0 &&
	    (fs->fault_flags & VM_FAULT_DIRTY) == 0) ||
	    (m->oflags & VPO_UNMANAGED) != 0)
		return;

	VM_PAGE_OBJECT_BUSY_ASSERT(m);

	need_dirty = ((fs->fault_type & VM_PROT_WRITE) != 0 &&
	    (fs->fault_flags & VM_FAULT_WIRE) == 0) ||
	    (fs->fault_flags & VM_FAULT_DIRTY) != 0;

	vm_object_set_writeable_dirty(m->object);

	/*
	 * If the fault is a write, we know that this page is being
	 * written NOW so dirty it explicitly to save on
	 * pmap_is_modified() calls later.
	 *
	 * Also, since the page is now dirty, we can possibly tell
	 * the pager to release any swap backing the page.
	 */
	if (need_dirty && vm_page_set_dirty(m) == 0) {
		/*
		 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
		 * if the page is already dirty to prevent data written with
		 * the expectation of being synced from not being synced.
		 * Likewise if this entry does not request NOSYNC then make
		 * sure the page isn't marked NOSYNC.  Applications sharing
		 * data should use the same flags to avoid ping ponging.
		 */
		if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0)
			vm_page_aflag_set(m, PGA_NOSYNC);
		else
			vm_page_aflag_clear(m, PGA_NOSYNC);
	}

}

/*
 * Unlocks fs.first_object and fs.map on success.
 */
static enum fault_status
vm_fault_soft_fast(struct faultstate *fs)
{
	vm_page_t m, m_map;
#if VM_NRESERVLEVEL > 0
	vm_page_t m_super;
	int flags;
#endif
	int psind;
	vm_offset_t vaddr;

	MPASS(fs->vp == NULL);

	/*
	 * If we fail, vast majority of the time it is because the page is not
	 * there to begin with. Opportunistically perform the lookup and
	 * subsequent checks without the object lock, revalidate later.
	 *
	 * Note: a busy page can be mapped for read|execute access.
	 */
	m = vm_page_lookup_unlocked(fs->first_object, fs->first_pindex);
	if (m == NULL || !vm_page_all_valid(m) ||
	    ((fs->prot & VM_PROT_WRITE) != 0 && vm_page_busied(m))) {
		VM_OBJECT_WLOCK(fs->first_object);
		return (FAULT_FAILURE);
	}

	vaddr = fs->vaddr;

	VM_OBJECT_RLOCK(fs->first_object);

	/*
	 * Now that we stabilized the state, revalidate the page is in the shape
	 * we encountered above.
	 */

	if (m->object != fs->first_object || m->pindex != fs->first_pindex)
		goto fail;

	vm_object_busy(fs->first_object);

	if (!vm_page_all_valid(m) ||
	    ((fs->prot & VM_PROT_WRITE) != 0 && vm_page_busied(m)))
		goto fail_busy;

	m_map = m;
	psind = 0;
#if VM_NRESERVLEVEL > 0
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    (m_super = vm_reserv_to_superpage(m)) != NULL) {
		psind = m_super->psind;
		KASSERT(psind > 0,
		    ("psind %d of m_super %p < 1", psind, m_super));
		flags = PS_ALL_VALID;
		if ((fs->prot & VM_PROT_WRITE) != 0) {
			/*
			 * Create a superpage mapping allowing write access
			 * only if none of the constituent pages are busy and
			 * all of them are already dirty (except possibly for
			 * the page that was faulted on).
			 */
			flags |= PS_NONE_BUSY;
			if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
				flags |= PS_ALL_DIRTY;
		}
		while (rounddown2(vaddr, pagesizes[psind]) < fs->entry->start ||
		    roundup2(vaddr + 1, pagesizes[psind]) > fs->entry->end ||
		    (vaddr & (pagesizes[psind] - 1)) !=
		    (VM_PAGE_TO_PHYS(m) & (pagesizes[psind] - 1)) ||
		    !vm_page_ps_test(m_super, psind, flags, m) ||
		    !pmap_ps_enabled(fs->map->pmap)) {
			psind--;
			if (psind == 0)
				break;
			m_super += rounddown2(m - m_super,
			    atop(pagesizes[psind]));
			KASSERT(m_super->psind >= psind,
			    ("psind %d of m_super %p < %d", m_super->psind,
			    m_super, psind));
		}
		if (psind > 0) {
			m_map = m_super;
			vaddr = rounddown2(vaddr, pagesizes[psind]);
			/* Preset the modified bit for dirty superpages. */
			if ((flags & PS_ALL_DIRTY) != 0)
				fs->fault_type |= VM_PROT_WRITE;
		}
	}
#endif
	if (pmap_enter(fs->map->pmap, vaddr, m_map, fs->prot, fs->fault_type |
	    PMAP_ENTER_NOSLEEP | (fs->wired ? PMAP_ENTER_WIRED : 0), psind) !=
	    KERN_SUCCESS)
		goto fail_busy;
	if (fs->m_hold != NULL) {
		(*fs->m_hold) = m;
		vm_page_wire(m);
	}
	if (psind == 0 && !fs->wired)
		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
	VM_OBJECT_RUNLOCK(fs->first_object);
	vm_fault_dirty(fs, m);
	vm_object_unbusy(fs->first_object);
	vm_map_lookup_done(fs->map, fs->entry);
	curthread->td_ru.ru_minflt++;
	return (FAULT_SUCCESS);
fail_busy:
	vm_object_unbusy(fs->first_object);
fail:
	if (!VM_OBJECT_TRYUPGRADE(fs->first_object)) {
		VM_OBJECT_RUNLOCK(fs->first_object);
		VM_OBJECT_WLOCK(fs->first_object);
	}
	return (FAULT_FAILURE);
}

static void
vm_fault_restore_map_lock(struct faultstate *fs)
{

	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
	MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);

	if (!vm_map_trylock_read(fs->map)) {
		VM_OBJECT_WUNLOCK(fs->first_object);
		vm_map_lock_read(fs->map);
		VM_OBJECT_WLOCK(fs->first_object);
	}
	fs->lookup_still_valid = true;
}

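/*
 * The vm_fault_populate_*() helpers below implement the pager "populate"
 * path: a pager that provides a populate method (typically a device or
 * physical-memory pager that can produce a whole run of pages at once) is
 * asked to instantiate and busy a range of pages around the faulting index,
 * which the fault handler then maps, using superpage mappings where
 * possible.  The helpers check that the pages handed back obey the busy and
 * validity rules and release any pages that fall outside the range finally
 * mapped.  (Summary only; the precise contract is described in the comments
 * inside vm_fault_populate().)
 */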
static void
vm_fault_populate_check_page(vm_page_t m)
{

	/*
	 * Check each page to ensure that the pager is obeying the
	 * interface: the page must be installed in the object, fully
	 * valid, and exclusively busied.
	 */
	MPASS(m != NULL);
	MPASS(vm_page_all_valid(m));
	MPASS(vm_page_xbusied(m));
}

static void
vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
    vm_pindex_t last)
{
	vm_page_t m;
	vm_pindex_t pidx;

	VM_OBJECT_ASSERT_WLOCKED(object);
	MPASS(first <= last);
	for (pidx = first, m = vm_page_lookup(object, pidx);
	    pidx <= last; pidx++, m = TAILQ_NEXT(m, listq)) {
		KASSERT(m != NULL && m->pindex == pidx,
		    ("%s: pindex mismatch", __func__));
		vm_fault_populate_check_page(m);
		vm_page_deactivate(m);
		vm_page_xunbusy(m);
	}
}

static enum fault_status
vm_fault_populate(struct faultstate *fs)
{
	vm_offset_t vaddr;
	vm_page_t m;
	vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
	int bdry_idx, i, npages, psind, rv;
	enum fault_status res;

	MPASS(fs->object == fs->first_object);
	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
	MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);
	MPASS(fs->first_object->backing_object == NULL);
	MPASS(fs->lookup_still_valid);

	pager_first = OFF_TO_IDX(fs->entry->offset);
	pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
	vm_fault_unlock_map(fs);
	vm_fault_unlock_vp(fs);

	res = FAULT_SUCCESS;

	/*
	 * Call the pager (driver) populate() method.
	 *
	 * There is no guarantee that the method will be called again
	 * if the current fault is for read, and a future fault is
	 * for write.  Report the entry's maximum allowed protection
	 * to the driver.
	 */
	rv = vm_pager_populate(fs->first_object, fs->first_pindex,
	    fs->fault_type, fs->entry->max_protection, &pager_first,
	    &pager_last);

	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
	if (rv == VM_PAGER_BAD) {
		/*
		 * VM_PAGER_BAD is the backdoor for a pager to request
		 * normal fault handling.
		 */
		vm_fault_restore_map_lock(fs);
		if (fs->map->timestamp != fs->map_generation)
			return (FAULT_RESTART);
		return (FAULT_CONTINUE);
	}
	if (rv != VM_PAGER_OK)
		return (FAULT_FAILURE); /* AKA SIGSEGV */

	/* Ensure that the driver is obeying the interface. */
	MPASS(pager_first <= pager_last);
	MPASS(fs->first_pindex <= pager_last);
	MPASS(fs->first_pindex >= pager_first);
	MPASS(pager_last < fs->first_object->size);

	vm_fault_restore_map_lock(fs);
	bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(fs->entry);
	if (fs->map->timestamp != fs->map_generation) {
		if (bdry_idx == 0) {
			vm_fault_populate_cleanup(fs->first_object, pager_first,
			    pager_last);
		} else {
			m = vm_page_lookup(fs->first_object, pager_first);
			if (m != fs->m)
				vm_page_xunbusy(m);
		}
		return (FAULT_RESTART);
	}

	/*
	 * The map is unchanged after our last unlock.  Process the fault.
	 *
	 * First, the special case of largepage mappings, where
	 * populate only busies the first page in superpage run.
	 */
	if (bdry_idx != 0) {
		KASSERT(PMAP_HAS_LARGEPAGES,
		    ("missing pmap support for large pages"));
		m = vm_page_lookup(fs->first_object, pager_first);
		vm_fault_populate_check_page(m);
		VM_OBJECT_WUNLOCK(fs->first_object);
		vaddr = fs->entry->start + IDX_TO_OFF(pager_first) -
		    fs->entry->offset;
		/* assert alignment for entry */
		KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0,
		    ("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx",
		    (uintmax_t)fs->entry->start, (uintmax_t)pager_first,
		    (uintmax_t)fs->entry->offset, (uintmax_t)vaddr));
		KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0,
		    ("unaligned superpage m %p %#jx", m,
		    (uintmax_t)VM_PAGE_TO_PHYS(m)));
		rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot,
		    fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0) |
		    PMAP_ENTER_LARGEPAGE, bdry_idx);
		VM_OBJECT_WLOCK(fs->first_object);
		vm_page_xunbusy(m);
		if (rv != KERN_SUCCESS) {
			res = FAULT_FAILURE;
			goto out;
		}
		if ((fs->fault_flags & VM_FAULT_WIRE) != 0) {
			for (i = 0; i < atop(pagesizes[bdry_idx]); i++)
				vm_page_wire(m + i);
		}
		if (fs->m_hold != NULL) {
			*fs->m_hold = m + (fs->first_pindex - pager_first);
			vm_page_wire(*fs->m_hold);
		}
		goto out;
	}

	/*
	 * The range [pager_first, pager_last] that is given to the
	 * pager is only a hint.  The pager may populate any range
	 * within the object that includes the requested page index.
	 * In case the pager expanded the range, clip it to fit into
	 * the map entry.
	 */
	map_first = OFF_TO_IDX(fs->entry->offset);
	if (map_first > pager_first) {
		vm_fault_populate_cleanup(fs->first_object, pager_first,
		    map_first - 1);
		pager_first = map_first;
	}
	map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
	if (map_last < pager_last) {
		vm_fault_populate_cleanup(fs->first_object, map_last + 1,
		    pager_last);
		pager_last = map_last;
	}
	for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
	    pidx <= pager_last;
	    pidx += npages, m = TAILQ_NEXT(&m[npages - 1], listq)) {
		vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
		KASSERT(m != NULL && m->pindex == pidx,
		    ("%s: pindex mismatch", __func__));
		psind = m->psind;
		while (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
		    pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
		    !pmap_ps_enabled(fs->map->pmap)))
			psind--;

		npages = atop(pagesizes[psind]);
		for (i = 0; i < npages; i++) {
			vm_fault_populate_check_page(&m[i]);
			vm_fault_dirty(fs, &m[i]);
		}
		VM_OBJECT_WUNLOCK(fs->first_object);
		rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, fs->fault_type |
		    (fs->wired ? PMAP_ENTER_WIRED : 0), psind);

		/*
		 * pmap_enter() may fail for a superpage mapping if additional
		 * protection policies prevent the full mapping.
		 * For example, this will happen on amd64 if the entire
		 * address range does not share the same userspace protection
		 * key.  Revert to single-page mappings if this happens.
		 */
		MPASS(rv == KERN_SUCCESS ||
		    (psind > 0 && rv == KERN_PROTECTION_FAILURE));
		if (__predict_false(psind > 0 &&
		    rv == KERN_PROTECTION_FAILURE)) {
			MPASS(!fs->wired);
			for (i = 0; i < npages; i++) {
				rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
				    &m[i], fs->prot, fs->fault_type, 0);
				MPASS(rv == KERN_SUCCESS);
			}
		}

		VM_OBJECT_WLOCK(fs->first_object);
		for (i = 0; i < npages; i++) {
			if ((fs->fault_flags & VM_FAULT_WIRE) != 0 &&
			    m[i].pindex == fs->first_pindex)
				vm_page_wire(&m[i]);
			else
				vm_page_activate(&m[i]);
			if (fs->m_hold != NULL &&
			    m[i].pindex == fs->first_pindex) {
				(*fs->m_hold) = &m[i];
				vm_page_wire(&m[i]);
			}
			vm_page_xunbusy(&m[i]);
		}
	}
out:
	curthread->td_ru.ru_majflt++;
	return (res);
}

static int prot_fault_translation;
SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
    &prot_fault_translation, 0,
    "Control signal to deliver on protection fault");

/* compat definition to keep common code for signal translation */
#define	UCODE_PAGEFLT	12
#ifdef T_PAGEFLT
_Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT");
#endif

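/*
 * prot_fault_translation selects the signal reported for a
 * KERN_PROTECTION_FAILURE result: 0 autodetects from the process's ABI
 * (SIGSEGV/SEGV_ACCERR for binaries built against a sufficiently new
 * FreeBSD, SIGBUS otherwise), 1 forces the historic SIGBUS behaviour, and
 * any other value forces SIGSEGV.  A machine-dependent trap handler
 * typically consumes vm_fault_trap() along these lines (schematic sketch
 * only, not the actual MD code):
 *
 *	int signo, ucode;
 *
 *	if (vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, &signo,
 *	    &ucode) != KERN_SUCCESS)
 *		deliver signo/ucode to the thread (e.g. via trapsignal());
 */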
/*
 *	vm_fault_trap:
 *
 *	Handle a page fault occurring at the given address,
 *	requiring the given permissions, in the map specified.
 *	If successful, the page is inserted into the
 *	associated physical map.
 *
 *	NOTE: the given address should be truncated to the
 *	proper page address.
 *
 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
 *	a standard error specifying why the fault is fatal is returned.
 *
 *	The map in question must be referenced, and remains so.
 *	Caller may hold no locks.
 */
int
vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
    int fault_flags, int *signo, int *ucode)
{
	int result;

	MPASS(signo == NULL || ucode != NULL);
#ifdef KTRACE
	if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT))
		ktrfault(vaddr, fault_type);
#endif
	result = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags,
	    NULL);
	KASSERT(result == KERN_SUCCESS || result == KERN_FAILURE ||
	    result == KERN_INVALID_ADDRESS ||
	    result == KERN_RESOURCE_SHORTAGE ||
	    result == KERN_PROTECTION_FAILURE ||
	    result == KERN_OUT_OF_BOUNDS,
	    ("Unexpected Mach error %d from vm_fault()", result));
#ifdef KTRACE
	if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND))
		ktrfaultend(result);
#endif
	if (result != KERN_SUCCESS && signo != NULL) {
		switch (result) {
		case KERN_FAILURE:
		case KERN_INVALID_ADDRESS:
			*signo = SIGSEGV;
			*ucode = SEGV_MAPERR;
			break;
		case KERN_RESOURCE_SHORTAGE:
			*signo = SIGBUS;
			*ucode = BUS_OOMERR;
			break;
		case KERN_OUT_OF_BOUNDS:
			*signo = SIGBUS;
			*ucode = BUS_OBJERR;
			break;
		case KERN_PROTECTION_FAILURE:
			if (prot_fault_translation == 0) {
				/*
				 * Autodetect.  This check also covers
				 * the images without the ABI-tag ELF
				 * note.
				 */
				if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
				    curproc->p_osrel >= P_OSREL_SIGSEGV) {
					*signo = SIGSEGV;
					*ucode = SEGV_ACCERR;
				} else {
					*signo = SIGBUS;
					*ucode = UCODE_PAGEFLT;
				}
			} else if (prot_fault_translation == 1) {
				/* Always compat mode. */
				*signo = SIGBUS;
				*ucode = UCODE_PAGEFLT;
			} else {
				/* Always SIGSEGV mode. */
				*signo = SIGSEGV;
				*ucode = SEGV_ACCERR;
			}
			break;
		default:
			KASSERT(0, ("Unexpected Mach error %d from vm_fault()",
			    result));
			break;
		}
	}
	return (result);
}

static bool
vm_fault_object_ensure_wlocked(struct faultstate *fs)
{
	if (fs->object == fs->first_object)
		VM_OBJECT_ASSERT_WLOCKED(fs->object);

	if (!fs->can_read_lock) {
		VM_OBJECT_ASSERT_WLOCKED(fs->object);
		return (true);
	}

	if (VM_OBJECT_WOWNED(fs->object))
		return (true);

	if (VM_OBJECT_TRYUPGRADE(fs->object))
		return (true);

	return (false);
}

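/*
 * vm_fault_object_ensure_wlocked() supports the read-locked traversal of
 * backing objects: when fs->can_read_lock is set, backing objects are
 * entered with a read lock and an upgrade to a write lock is attempted only
 * once one is actually needed.  If the upgrade cannot be obtained, the
 * caller (see vm_fault_next()) clears can_read_lock and restarts the fault
 * so that the retry takes write locks from the start.
 */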
static enum fault_status
vm_fault_lock_vnode(struct faultstate *fs, bool objlocked)
{
	struct vnode *vp;
	int error, locked;

	if (fs->object->type != OBJT_VNODE)
		return (FAULT_CONTINUE);
	vp = fs->object->handle;
	if (vp == fs->vp) {
		ASSERT_VOP_LOCKED(vp, "saved vnode is not locked");
		return (FAULT_CONTINUE);
	}

	/*
	 * Perform an unlock in case the desired vnode changed while
	 * the map was unlocked during a retry.
	 */
	vm_fault_unlock_vp(fs);

	locked = VOP_ISLOCKED(vp);
	if (locked != LK_EXCLUSIVE)
		locked = LK_SHARED;

	/*
	 * We must not sleep acquiring the vnode lock while we have
	 * the page exclusive busied or the object's
	 * paging-in-progress count incremented.  Otherwise, we could
	 * deadlock.
	 */
	error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT);
	if (error == 0) {
		fs->vp = vp;
		return (FAULT_CONTINUE);
	}

	vhold(vp);
	if (objlocked)
		vm_fault_unlock_and_deallocate(fs);
	else
		vm_fault_deallocate(fs);
	error = vget(vp, locked | LK_RETRY | LK_CANRECURSE);
	vdrop(vp);
	fs->vp = vp;
	KASSERT(error == 0, ("vm_fault: vget failed %d", error));
	return (FAULT_RESTART);
}

/*
 * Calculate the desired readahead.  Handle drop-behind.
 *
 * Returns the number of readahead blocks to pass to the pager.
 */
static int
vm_fault_readahead(struct faultstate *fs)
{
	int era, nera;
	u_char behavior;

	KASSERT(fs->lookup_still_valid, ("map unlocked"));
	era = fs->entry->read_ahead;
	behavior = vm_map_entry_behavior(fs->entry);
	if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
		nera = 0;
	} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
		nera = VM_FAULT_READ_AHEAD_MAX;
		if (fs->vaddr == fs->entry->next_read)
			vm_fault_dontneed(fs, fs->vaddr, nera);
	} else if (fs->vaddr == fs->entry->next_read) {
		/*
		 * This is a sequential fault.  Arithmetically
		 * increase the requested number of pages in
		 * the read-ahead window.  The requested
		 * number of pages is "# of sequential faults
		 * x (read ahead min + 1) + read ahead min"
		 */
		nera = VM_FAULT_READ_AHEAD_MIN;
		if (era > 0) {
			nera += era + 1;
			if (nera > VM_FAULT_READ_AHEAD_MAX)
				nera = VM_FAULT_READ_AHEAD_MAX;
		}
		if (era == VM_FAULT_READ_AHEAD_MAX)
			vm_fault_dontneed(fs, fs->vaddr, nera);
	} else {
		/*
		 * This is a non-sequential fault.
		 */
		nera = 0;
	}
	if (era != nera) {
		/*
		 * A read lock on the map suffices to update
		 * the read ahead count safely.
		 */
		fs->entry->read_ahead = nera;
	}

	return (nera);
}

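/*
 * Worked example of the read-ahead growth in vm_fault_readahead(): the
 * first sequential fault sees era == 0 and requests VM_FAULT_READ_AHEAD_MIN
 * pages, the next one requests 2 * MIN + 1, the one after that 3 * MIN + 2,
 * and so on, i.e. the window grows by MIN + 1 pages per sequential fault
 * until it is clamped at VM_FAULT_READ_AHEAD_MAX; once the old window has
 * already reached the maximum, vm_fault_dontneed() is invoked to drop the
 * pages left behind the scan.
 */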
static int
vm_fault_lookup(struct faultstate *fs)
{
	int result;

	KASSERT(!fs->lookup_still_valid,
	    ("vm_fault_lookup: Map already locked."));
	result = vm_map_lookup(&fs->map, fs->vaddr, fs->fault_type |
	    VM_PROT_FAULT_LOOKUP, &fs->entry, &fs->first_object,
	    &fs->first_pindex, &fs->prot, &fs->wired);
	if (result != KERN_SUCCESS) {
		vm_fault_unlock_vp(fs);
		return (result);
	}

	fs->map_generation = fs->map->timestamp;

	if (fs->entry->eflags & MAP_ENTRY_NOFAULT) {
		panic("%s: fault on nofault entry, addr: %#lx",
		    __func__, (u_long)fs->vaddr);
	}

	if (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION &&
	    fs->entry->wiring_thread != curthread) {
		vm_map_unlock_read(fs->map);
		vm_map_lock(fs->map);
		if (vm_map_lookup_entry(fs->map, fs->vaddr, &fs->entry) &&
		    (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
			vm_fault_unlock_vp(fs);
			fs->entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
			vm_map_unlock_and_wait(fs->map, 0);
		} else
			vm_map_unlock(fs->map);
		return (KERN_RESOURCE_SHORTAGE);
	}

	MPASS((fs->entry->eflags & MAP_ENTRY_GUARD) == 0);

	if (fs->wired)
		fs->fault_type = fs->prot | (fs->fault_type & VM_PROT_COPY);
	else
		KASSERT((fs->fault_flags & VM_FAULT_WIRE) == 0,
		    ("!fs->wired && VM_FAULT_WIRE"));
	fs->lookup_still_valid = true;

	return (KERN_SUCCESS);
}

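/*
 * Note that vm_fault_lookup() reports a map entry that another thread is
 * currently wiring or unwiring (MAP_ENTRY_IN_TRANSITION) as
 * KERN_RESOURCE_SHORTAGE, possibly after sleeping on the map; the intent is
 * that the caller treats this particular return as "retry the fault" rather
 * than as a genuine shortage of memory.
 */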
static int
vm_fault_relookup(struct faultstate *fs)
{
	vm_object_t retry_object;
	vm_pindex_t retry_pindex;
	vm_prot_t retry_prot;
	int result;

	if (!vm_map_trylock_read(fs->map))
		return (KERN_RESTART);

	fs->lookup_still_valid = true;
	if (fs->map->timestamp == fs->map_generation)
		return (KERN_SUCCESS);

	result = vm_map_lookup_locked(&fs->map, fs->vaddr, fs->fault_type,
	    &fs->entry, &retry_object, &retry_pindex, &retry_prot,
	    &fs->wired);
	if (result != KERN_SUCCESS) {
		/*
		 * If retry of map lookup would have blocked then
		 * retry fault from start.
		 */
		if (result == KERN_FAILURE)
			return (KERN_RESTART);
		return (result);
	}
	if (retry_object != fs->first_object ||
	    retry_pindex != fs->first_pindex)
		return (KERN_RESTART);

	/*
	 * Check whether the protection has changed or the object has
	 * been copied while we left the map unlocked.  Changing from
	 * read to write permission is OK - we leave the page
	 * write-protected, and catch the write fault.  Changing from
	 * write to read permission means that we can't mark the page
	 * write-enabled after all.
	 */
	fs->prot &= retry_prot;
	fs->fault_type &= retry_prot;
	if (fs->prot == 0)
		return (KERN_RESTART);

	/* Reassert because wired may have changed. */
	KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0,
	    ("!wired && VM_FAULT_WIRE"));

	return (KERN_SUCCESS);
}

static void
vm_fault_cow(struct faultstate *fs)
{
	bool is_first_object_locked;

	KASSERT(fs->object != fs->first_object,
	    ("source and target COW objects are identical"));

	/*
	 * This allows pages to be virtually copied from a backing_object
	 * into the first_object, where the backing object has no other
	 * refs to it, and cannot gain any more refs.  Instead of a bcopy,
	 * we just move the page from the backing object to the first
	 * object.  Note that we must mark the page dirty in the first
	 * object so that it will go out to swap when needed.
	 */
	is_first_object_locked = false;
	if (
	    /*
	     * Only one shadow object and no other refs.
	     */
	    fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
	    /*
	     * No other ways to look the object up
	     */
	    fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 &&
	    /*
	     * We don't chase down the shadow chain and we can acquire locks.
	     */
	    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) &&
	    fs->object == fs->first_object->backing_object &&
	    VM_OBJECT_TRYWLOCK(fs->object)) {
		/*
		 * Remove but keep xbusy for replace.  fs->m is moved into
		 * fs->first_object and left busy while fs->first_m is
		 * conditionally freed.
		 */
		vm_page_remove_xbusy(fs->m);
		vm_page_replace(fs->m, fs->first_object, fs->first_pindex,
		    fs->first_m);
		vm_page_dirty(fs->m);
#if VM_NRESERVLEVEL > 0
		/*
		 * Rename the reservation.
		 */
		vm_reserv_rename(fs->m, fs->first_object, fs->object,
		    OFF_TO_IDX(fs->first_object->backing_object_offset));
#endif
		VM_OBJECT_WUNLOCK(fs->object);
		VM_OBJECT_WUNLOCK(fs->first_object);
		fs->first_m = fs->m;
		fs->m = NULL;
		VM_CNT_INC(v_cow_optim);
	} else {
		if (is_first_object_locked)
			VM_OBJECT_WUNLOCK(fs->first_object);
		/*
		 * Oh, well, lets copy it.
		 */
		pmap_copy_page(fs->m, fs->first_m);
		vm_page_valid(fs->first_m);
		if (fs->wired && (fs->fault_flags & VM_FAULT_WIRE) == 0) {
			vm_page_wire(fs->first_m);
			vm_page_unwire(fs->m, PQ_INACTIVE);
		}
		/*
		 * Save the cow page to be released after
		 * pmap_enter is complete.
		 */
		fs->m_cow = fs->m;
		fs->m = NULL;

		/*
		 * Typically, the shadow object is either private to this
		 * address space (OBJ_ONEMAPPING) or its pages are read only.
		 * In the highly unusual case where the pages of a shadow object
		 * are read/write shared between this and other address spaces,
		 * we need to ensure that any pmap-level mappings to the
		 * original, copy-on-write page from the backing object are
		 * removed from those other address spaces.
		 *
		 * The flag check is racy, but this is tolerable: if
		 * OBJ_ONEMAPPING is cleared after the check, the busy state
		 * ensures that new mappings of m_cow can't be created.
		 * pmap_enter() will replace an existing mapping in the current
		 * address space.  If OBJ_ONEMAPPING is set after the check,
		 * removing mappings will at worse trigger some unnecessary page
		 * faults.
10945936b6a8SJeff Roberson */ 1095982693bbSMark Johnston vm_page_assert_xbusied(fs->m_cow); 1096982693bbSMark Johnston if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0) 1097982693bbSMark Johnston pmap_remove_all(fs->m_cow); 1098982693bbSMark Johnston } 1099982693bbSMark Johnston 11005936b6a8SJeff Roberson vm_object_pip_wakeup(fs->object); 11015936b6a8SJeff Roberson 11025936b6a8SJeff Roberson /* 11035936b6a8SJeff Roberson * Only use the new page below... 11045936b6a8SJeff Roberson */ 11055936b6a8SJeff Roberson fs->object = fs->first_object; 11065936b6a8SJeff Roberson fs->pindex = fs->first_pindex; 11075936b6a8SJeff Roberson fs->m = fs->first_m; 11085936b6a8SJeff Roberson VM_CNT_INC(v_cow_faults); 11095936b6a8SJeff Roberson curthread->td_cow++; 11105936b6a8SJeff Roberson } 11115936b6a8SJeff Roberson 1112fdb1dbb1SMateusz Guzik static enum fault_next_status 111391eb2e90SJeff Roberson vm_fault_next(struct faultstate *fs) 111491eb2e90SJeff Roberson { 111591eb2e90SJeff Roberson vm_object_t next_object; 111691eb2e90SJeff Roberson 1117fdb1dbb1SMateusz Guzik if (fs->object == fs->first_object || !fs->can_read_lock) 111873b951cdSMateusz Guzik VM_OBJECT_ASSERT_WLOCKED(fs->object); 1119fdb1dbb1SMateusz Guzik else 1120fdb1dbb1SMateusz Guzik VM_OBJECT_ASSERT_LOCKED(fs->object); 112173b951cdSMateusz Guzik 112291eb2e90SJeff Roberson /* 112391eb2e90SJeff Roberson * The requested page does not exist at this object/ 112491eb2e90SJeff Roberson * offset. Remove the invalid page from the object, 112591eb2e90SJeff Roberson * waking up anyone waiting for it, and continue on to 112691eb2e90SJeff Roberson * the next object. However, if this is the top-level 112791eb2e90SJeff Roberson * object, we must leave the busy page in place to 112891eb2e90SJeff Roberson * prevent another process from rushing past us, and 112991eb2e90SJeff Roberson * inserting the page in that object at the same time 113091eb2e90SJeff Roberson * that we are. 113191eb2e90SJeff Roberson */ 113291eb2e90SJeff Roberson if (fs->object == fs->first_object) { 113391eb2e90SJeff Roberson fs->first_m = fs->m; 113491eb2e90SJeff Roberson fs->m = NULL; 11353c3a434fSMateusz Guzik } else if (fs->m != NULL) { 1136fdb1dbb1SMateusz Guzik if (!vm_fault_object_ensure_wlocked(fs)) { 1137fdb1dbb1SMateusz Guzik fs->can_read_lock = false; 11380a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(fs); 1139fdb1dbb1SMateusz Guzik return (FAULT_NEXT_RESTART); 1140fdb1dbb1SMateusz Guzik } 11410a310c94SMateusz Guzik vm_fault_page_free(&fs->m); 1142fdb1dbb1SMateusz Guzik } 114391eb2e90SJeff Roberson 114491eb2e90SJeff Roberson /* 114591eb2e90SJeff Roberson * Move on to the next object. Lock the next object before 114691eb2e90SJeff Roberson * unlocking the current one. 
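 * Taking the next object's lock (and its paging-in-progress reference,
 * below) before dropping the current one keeps the walk down the shadow
 * chain consistent: the backing object cannot be torn down in between.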
114791eb2e90SJeff Roberson */ 114891eb2e90SJeff Roberson next_object = fs->object->backing_object; 1149fb4d37eaSJeff Roberson if (next_object == NULL) 1150fdb1dbb1SMateusz Guzik return (FAULT_NEXT_NOOBJ); 1151fb4d37eaSJeff Roberson MPASS(fs->first_m != NULL); 1152fb4d37eaSJeff Roberson KASSERT(fs->object != next_object, ("object loop %p", next_object)); 1153fdb1dbb1SMateusz Guzik if (fs->can_read_lock) 1154fdb1dbb1SMateusz Guzik VM_OBJECT_RLOCK(next_object); 1155fdb1dbb1SMateusz Guzik else 1156fb4d37eaSJeff Roberson VM_OBJECT_WLOCK(next_object); 1157fb4d37eaSJeff Roberson vm_object_pip_add(next_object, 1); 1158fb4d37eaSJeff Roberson if (fs->object != fs->first_object) 1159fb4d37eaSJeff Roberson vm_object_pip_wakeup(fs->object); 1160fb4d37eaSJeff Roberson fs->pindex += OFF_TO_IDX(fs->object->backing_object_offset); 1161fdb1dbb1SMateusz Guzik VM_OBJECT_UNLOCK(fs->object); 1162fb4d37eaSJeff Roberson fs->object = next_object; 1163fb4d37eaSJeff Roberson 1164fdb1dbb1SMateusz Guzik return (FAULT_NEXT_GOTOBJ); 1165fb4d37eaSJeff Roberson } 1166fb4d37eaSJeff Roberson 1167fb4d37eaSJeff Roberson static void 1168fb4d37eaSJeff Roberson vm_fault_zerofill(struct faultstate *fs) 1169fb4d37eaSJeff Roberson { 1170fb4d37eaSJeff Roberson 117191eb2e90SJeff Roberson /* 117291eb2e90SJeff Roberson * If there's no object left, fill the page in the top 117391eb2e90SJeff Roberson * object with zeros. 117491eb2e90SJeff Roberson */ 117591eb2e90SJeff Roberson if (fs->object != fs->first_object) { 117691eb2e90SJeff Roberson vm_object_pip_wakeup(fs->object); 117791eb2e90SJeff Roberson fs->object = fs->first_object; 117891eb2e90SJeff Roberson fs->pindex = fs->first_pindex; 117991eb2e90SJeff Roberson } 118091eb2e90SJeff Roberson MPASS(fs->first_m != NULL); 118191eb2e90SJeff Roberson MPASS(fs->m == NULL); 118291eb2e90SJeff Roberson fs->m = fs->first_m; 118391eb2e90SJeff Roberson fs->first_m = NULL; 118491eb2e90SJeff Roberson 118591eb2e90SJeff Roberson /* 118691eb2e90SJeff Roberson * Zero the page if necessary and mark it valid. 118791eb2e90SJeff Roberson */ 118891eb2e90SJeff Roberson if ((fs->m->flags & PG_ZERO) == 0) { 118991eb2e90SJeff Roberson pmap_zero_page(fs->m); 119091eb2e90SJeff Roberson } else { 119191eb2e90SJeff Roberson VM_CNT_INC(v_ozfod); 119291eb2e90SJeff Roberson } 119391eb2e90SJeff Roberson VM_CNT_INC(v_zfod); 119491eb2e90SJeff Roberson vm_page_valid(fs->m); 119591eb2e90SJeff Roberson } 119691eb2e90SJeff Roberson 1197df794f5cSJeff Roberson /* 1198174aad04SKonstantin Belousov * Initiate page fault after timeout. Returns true if caller should 1199174aad04SKonstantin Belousov * do vm_waitpfault() after the call. 
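 * The first failed allocation records a start time and the caller simply
 * waits and retries; vm_pageout_oom(VM_OOM_MEM_PF) is invoked only once
 * roughly vm_pfault_oom_attempts * vm_pfault_oom_wait seconds have
 * elapsed.  A negative vm_pfault_oom_attempts disables the OOM kill from
 * this path entirely.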
1200174aad04SKonstantin Belousov */ 1201174aad04SKonstantin Belousov static bool 1202174aad04SKonstantin Belousov vm_fault_allocate_oom(struct faultstate *fs) 1203174aad04SKonstantin Belousov { 1204174aad04SKonstantin Belousov struct timeval now; 1205174aad04SKonstantin Belousov 12060a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(fs); 1207174aad04SKonstantin Belousov if (vm_pfault_oom_attempts < 0) 1208174aad04SKonstantin Belousov return (true); 1209174aad04SKonstantin Belousov if (!fs->oom_started) { 1210174aad04SKonstantin Belousov fs->oom_started = true; 1211174aad04SKonstantin Belousov getmicrotime(&fs->oom_start_time); 1212174aad04SKonstantin Belousov return (true); 1213174aad04SKonstantin Belousov } 1214174aad04SKonstantin Belousov 1215174aad04SKonstantin Belousov getmicrotime(&now); 1216174aad04SKonstantin Belousov timevalsub(&now, &fs->oom_start_time); 1217174aad04SKonstantin Belousov if (now.tv_sec < vm_pfault_oom_attempts * vm_pfault_oom_wait) 1218174aad04SKonstantin Belousov return (true); 1219174aad04SKonstantin Belousov 1220174aad04SKonstantin Belousov if (bootverbose) 1221174aad04SKonstantin Belousov printf( 1222174aad04SKonstantin Belousov "proc %d (%s) failed to alloc page on fault, starting OOM\n", 1223174aad04SKonstantin Belousov curproc->p_pid, curproc->p_comm); 1224174aad04SKonstantin Belousov vm_pageout_oom(VM_OOM_MEM_PF); 1225174aad04SKonstantin Belousov fs->oom_started = false; 1226174aad04SKonstantin Belousov return (false); 1227174aad04SKonstantin Belousov } 1228174aad04SKonstantin Belousov 1229174aad04SKonstantin Belousov /* 1230df794f5cSJeff Roberson * Allocate a page directly or via the object populate method. 1231df794f5cSJeff Roberson */ 1232f1b642c2SMark Johnston static enum fault_status 1233df794f5cSJeff Roberson vm_fault_allocate(struct faultstate *fs) 1234df794f5cSJeff Roberson { 1235df794f5cSJeff Roberson struct domainset *dset; 1236f1b642c2SMark Johnston enum fault_status res; 1237df794f5cSJeff Roberson 1238df794f5cSJeff Roberson if ((fs->object->flags & OBJ_SIZEVNLOCK) != 0) { 1239f1b642c2SMark Johnston res = vm_fault_lock_vnode(fs, true); 1240f1b642c2SMark Johnston MPASS(res == FAULT_CONTINUE || res == FAULT_RESTART); 1241f1b642c2SMark Johnston if (res == FAULT_RESTART) 1242f1b642c2SMark Johnston return (res); 1243df794f5cSJeff Roberson } 1244df794f5cSJeff Roberson 1245f1b642c2SMark Johnston if (fs->pindex >= fs->object->size) { 12460a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(fs); 1247f1b642c2SMark Johnston return (FAULT_OUT_OF_BOUNDS); 1248f1b642c2SMark Johnston } 1249df794f5cSJeff Roberson 1250df794f5cSJeff Roberson if (fs->object == fs->first_object && 1251df794f5cSJeff Roberson (fs->first_object->flags & OBJ_POPULATE) != 0 && 1252df794f5cSJeff Roberson fs->first_object->shadow_count == 0) { 1253f1b642c2SMark Johnston res = vm_fault_populate(fs); 1254f1b642c2SMark Johnston switch (res) { 1255f1b642c2SMark Johnston case FAULT_SUCCESS: 1256f1b642c2SMark Johnston case FAULT_FAILURE: 1257f1b642c2SMark Johnston case FAULT_RESTART: 12580a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(fs); 1259f1b642c2SMark Johnston return (res); 1260f1b642c2SMark Johnston case FAULT_CONTINUE: 1261df794f5cSJeff Roberson /* 1262df794f5cSJeff Roberson * Pager's populate() method 1263df794f5cSJeff Roberson * returned VM_PAGER_BAD. 
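 * The populate method did not handle the faulting range, so execution
 * falls through to the ordinary single-page allocation below.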
1264df794f5cSJeff Roberson */ 1265df794f5cSJeff Roberson break; 1266df794f5cSJeff Roberson default: 1267df794f5cSJeff Roberson panic("inconsistent return codes"); 1268df794f5cSJeff Roberson } 1269df794f5cSJeff Roberson } 1270df794f5cSJeff Roberson 1271df794f5cSJeff Roberson /* 1272df794f5cSJeff Roberson * Allocate a new page for this object/offset pair. 1273df794f5cSJeff Roberson * 1274b801c79dSMark Johnston * If the process has a fatal signal pending, prioritize the allocation 1275b801c79dSMark Johnston * with the expectation that the process will exit shortly and free some 1276b801c79dSMark Johnston * pages. In particular, the signal may have been posted by the page 1277b801c79dSMark Johnston * daemon in an attempt to resolve an out-of-memory condition. 1278b801c79dSMark Johnston * 1279b801c79dSMark Johnston * The unlocked read of the p_flag is harmless. At worst, the P_KILLED 1280b801c79dSMark Johnston * might be not observed here, and allocation fails, causing a restart 1281b801c79dSMark Johnston * and new reading of the p_flag. 1282df794f5cSJeff Roberson */ 1283df794f5cSJeff Roberson dset = fs->object->domain.dr_policy; 1284df794f5cSJeff Roberson if (dset == NULL) 1285df794f5cSJeff Roberson dset = curthread->td_domain.dr_policy; 1286df794f5cSJeff Roberson if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) { 1287df794f5cSJeff Roberson #if VM_NRESERVLEVEL > 0 1288df794f5cSJeff Roberson vm_object_color(fs->object, atop(fs->vaddr) - fs->pindex); 1289df794f5cSJeff Roberson #endif 1290ec201dddSKonstantin Belousov if (!vm_pager_can_alloc_page(fs->object, fs->pindex)) { 12910a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(fs); 1292ec201dddSKonstantin Belousov return (FAULT_FAILURE); 1293ec201dddSKonstantin Belousov } 1294b801c79dSMark Johnston fs->m = vm_page_alloc(fs->object, fs->pindex, 1295b801c79dSMark Johnston P_KILLED(curproc) ? VM_ALLOC_SYSTEM : 0); 1296df794f5cSJeff Roberson } 1297df794f5cSJeff Roberson if (fs->m == NULL) { 1298174aad04SKonstantin Belousov if (vm_fault_allocate_oom(fs)) 1299df794f5cSJeff Roberson vm_waitpfault(dset, vm_pfault_oom_wait * hz); 1300f1b642c2SMark Johnston return (FAULT_RESTART); 1301df794f5cSJeff Roberson } 1302174aad04SKonstantin Belousov fs->oom_started = false; 1303df794f5cSJeff Roberson 1304f1b642c2SMark Johnston return (FAULT_CONTINUE); 1305df794f5cSJeff Roberson } 13065909dafeSJeff Roberson 13075909dafeSJeff Roberson /* 13085909dafeSJeff Roberson * Call the pager to retrieve the page if there is a chance 13095909dafeSJeff Roberson * that the pager has it, and potentially retrieve additional 13105909dafeSJeff Roberson * pages at the same time. 13115909dafeSJeff Roberson */ 1312f1b642c2SMark Johnston static enum fault_status 131345c09a74SMark Johnston vm_fault_getpages(struct faultstate *fs, int *behindp, int *aheadp) 13145909dafeSJeff Roberson { 13155909dafeSJeff Roberson vm_offset_t e_end, e_start; 13165909dafeSJeff Roberson int ahead, behind, cluster_offset, rv; 1317f1b642c2SMark Johnston enum fault_status status; 13185909dafeSJeff Roberson u_char behavior; 13195909dafeSJeff Roberson 13205909dafeSJeff Roberson /* 13215909dafeSJeff Roberson * Prepare for unlocking the map. Save the map 13225909dafeSJeff Roberson * entry's start and end addresses, which are used to 13235909dafeSJeff Roberson * optimize the size of the pager operation below. 13245909dafeSJeff Roberson * Even if the map entry's addresses change after 13255909dafeSJeff Roberson * unlocking the map, using the saved addresses is 13265909dafeSJeff Roberson * safe. 
13275909dafeSJeff Roberson */ 13285909dafeSJeff Roberson e_start = fs->entry->start; 13295909dafeSJeff Roberson e_end = fs->entry->end; 13305909dafeSJeff Roberson behavior = vm_map_entry_behavior(fs->entry); 13315909dafeSJeff Roberson 13325909dafeSJeff Roberson /* 133345c09a74SMark Johnston * If the pager for the current object might have 133445c09a74SMark Johnston * the page, then determine the number of additional 133545c09a74SMark Johnston * pages to read and potentially reprioritize 133645c09a74SMark Johnston * previously read pages for earlier reclamation. 133745c09a74SMark Johnston * These operations should only be performed once per 133845c09a74SMark Johnston * page fault. Even if the current pager doesn't 133945c09a74SMark Johnston * have the page, the number of additional pages to 134045c09a74SMark Johnston * read will apply to subsequent objects in the 134145c09a74SMark Johnston * shadow chain. 134245c09a74SMark Johnston */ 134345c09a74SMark Johnston if (fs->nera == -1 && !P_KILLED(curproc)) 134445c09a74SMark Johnston fs->nera = vm_fault_readahead(fs); 134545c09a74SMark Johnston 134645c09a74SMark Johnston /* 13475909dafeSJeff Roberson * Release the map lock before locking the vnode or 13485909dafeSJeff Roberson * sleeping in the pager. (If the current object has 13495909dafeSJeff Roberson * a shadow, then an earlier iteration of this loop 13505909dafeSJeff Roberson * may have already unlocked the map.) 13515909dafeSJeff Roberson */ 13520a310c94SMateusz Guzik vm_fault_unlock_map(fs); 13535909dafeSJeff Roberson 1354f1b642c2SMark Johnston status = vm_fault_lock_vnode(fs, false); 1355f1b642c2SMark Johnston MPASS(status == FAULT_CONTINUE || status == FAULT_RESTART); 1356f1b642c2SMark Johnston if (status == FAULT_RESTART) 1357f1b642c2SMark Johnston return (status); 13585909dafeSJeff Roberson KASSERT(fs->vp == NULL || !fs->map->system_map, 13595909dafeSJeff Roberson ("vm_fault: vnode-backed object mapped by system map")); 13605909dafeSJeff Roberson 13615909dafeSJeff Roberson /* 13625909dafeSJeff Roberson * Page in the requested page and hint the pager, 13635909dafeSJeff Roberson * that it may bring up surrounding pages. 13645909dafeSJeff Roberson */ 136545c09a74SMark Johnston if (fs->nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM || 13665909dafeSJeff Roberson P_KILLED(curproc)) { 13675909dafeSJeff Roberson behind = 0; 13685909dafeSJeff Roberson ahead = 0; 13695909dafeSJeff Roberson } else { 13705909dafeSJeff Roberson /* Is this a sequential fault? */ 137145c09a74SMark Johnston if (fs->nera > 0) { 13725909dafeSJeff Roberson behind = 0; 137345c09a74SMark Johnston ahead = fs->nera; 13745909dafeSJeff Roberson } else { 13755909dafeSJeff Roberson /* 13765909dafeSJeff Roberson * Request a cluster of pages that is 13775909dafeSJeff Roberson * aligned to a VM_FAULT_READ_DEFAULT 13785909dafeSJeff Roberson * page offset boundary within the 13795909dafeSJeff Roberson * object. Alignment to a page offset 13805909dafeSJeff Roberson * boundary is more likely to coincide 13815909dafeSJeff Roberson * with the underlying file system 13825909dafeSJeff Roberson * block than alignment to a virtual 13835909dafeSJeff Roberson * address boundary. 
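 * As a worked example: if the faulting pindex lands five pages into a
 * cluster, at most five pages are read behind and (cluster size - 6)
 * pages ahead, so the request never crosses a cluster-aligned boundary
 * within the object.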
13845909dafeSJeff Roberson */ 13855909dafeSJeff Roberson cluster_offset = fs->pindex % VM_FAULT_READ_DEFAULT; 13865909dafeSJeff Roberson behind = ulmin(cluster_offset, 13875909dafeSJeff Roberson atop(fs->vaddr - e_start)); 13885909dafeSJeff Roberson ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset; 13895909dafeSJeff Roberson } 13905909dafeSJeff Roberson ahead = ulmin(ahead, atop(e_end - fs->vaddr) - 1); 13915909dafeSJeff Roberson } 13925909dafeSJeff Roberson *behindp = behind; 13935909dafeSJeff Roberson *aheadp = ahead; 13945909dafeSJeff Roberson rv = vm_pager_get_pages(fs->object, &fs->m, 1, behindp, aheadp); 13955909dafeSJeff Roberson if (rv == VM_PAGER_OK) 1396f1b642c2SMark Johnston return (FAULT_HARD); 13975909dafeSJeff Roberson if (rv == VM_PAGER_ERROR) 13985909dafeSJeff Roberson printf("vm_fault: pager read error, pid %d (%s)\n", 13995909dafeSJeff Roberson curproc->p_pid, curproc->p_comm); 14005909dafeSJeff Roberson /* 14015909dafeSJeff Roberson * If an I/O error occurred or the requested page was 14025909dafeSJeff Roberson * outside the range of the pager, clean up and return 14035909dafeSJeff Roberson * an error. 14045909dafeSJeff Roberson */ 140545c09a74SMark Johnston if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) { 140645c09a74SMark Johnston VM_OBJECT_WLOCK(fs->object); 14070a310c94SMateusz Guzik vm_fault_page_free(&fs->m); 14080a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(fs); 1409f1b642c2SMark Johnston return (FAULT_OUT_OF_BOUNDS); 141045c09a74SMark Johnston } 141145c09a74SMark Johnston KASSERT(rv == VM_PAGER_FAIL, 14129a89977bSPeter Jeremy ("%s: unexpected pager error %d", __func__, rv)); 1413f1b642c2SMark Johnston return (FAULT_CONTINUE); 14145909dafeSJeff Roberson } 14155909dafeSJeff Roberson 14165949b1caSJeff Roberson /* 1417bef91632SJeff Roberson * Wait/Retry if the page is busy. We have to do this if the page is 1418bef91632SJeff Roberson * either exclusive or shared busy because the vm_pager may be using 1419bef91632SJeff Roberson * read busy for pageouts (and even pageins if it is the vnode pager), 1420bef91632SJeff Roberson * and we could end up trying to pagein and pageout the same page 1421bef91632SJeff Roberson * simultaneously. 1422bef91632SJeff Roberson * 1423bef91632SJeff Roberson * We can theoretically allow the busy case on a read fault if the page 1424bef91632SJeff Roberson * is marked valid, but since such pages are typically already pmap'd, 1425bef91632SJeff Roberson * putting that special case in might be more effort than it is worth. 1426bef91632SJeff Roberson * We cannot under any circumstances mess around with a shared busied 1427bef91632SJeff Roberson * page except, perhaps, to pmap it. 1428bef91632SJeff Roberson */ 1429bef91632SJeff Roberson static void 1430bef91632SJeff Roberson vm_fault_busy_sleep(struct faultstate *fs) 1431bef91632SJeff Roberson { 1432bef91632SJeff Roberson /* 1433bef91632SJeff Roberson * Reference the page before unlocking and 1434bef91632SJeff Roberson * sleeping so that the page daemon is less 1435bef91632SJeff Roberson * likely to reclaim it.
1436bef91632SJeff Roberson */ 1437bef91632SJeff Roberson vm_page_aflag_set(fs->m, PGA_REFERENCED); 1438bef91632SJeff Roberson if (fs->object != fs->first_object) { 14390a310c94SMateusz Guzik vm_fault_page_release(&fs->first_m); 1440bef91632SJeff Roberson vm_object_pip_wakeup(fs->first_object); 1441bef91632SJeff Roberson } 1442bef91632SJeff Roberson vm_object_pip_wakeup(fs->object); 14430a310c94SMateusz Guzik vm_fault_unlock_map(fs); 144487b64663SMark Johnston if (fs->m != vm_page_lookup(fs->object, fs->pindex) || 144587b64663SMark Johnston !vm_page_busy_sleep(fs->m, "vmpfw", 0)) 1446fdb1dbb1SMateusz Guzik VM_OBJECT_UNLOCK(fs->object); 1447bef91632SJeff Roberson VM_CNT_INC(v_intrans); 1448bef91632SJeff Roberson vm_object_deallocate(fs->first_object); 1449bef91632SJeff Roberson } 1450bef91632SJeff Roberson 1451d47d3a94SMark Johnston /* 1452d47d3a94SMark Johnston * Handle page lookup, populate, allocate, page-in for the current 1453d47d3a94SMark Johnston * object. 1454d47d3a94SMark Johnston * 1455d47d3a94SMark Johnston * The object is locked on entry and will remain locked with a return 1456d47d3a94SMark Johnston * code of FAULT_CONTINUE so that fault may follow the shadow chain. 1457d47d3a94SMark Johnston * Otherwise, the object will be unlocked upon return. 1458d47d3a94SMark Johnston */ 1459d47d3a94SMark Johnston static enum fault_status 1460d47d3a94SMark Johnston vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp) 1461d47d3a94SMark Johnston { 1462d47d3a94SMark Johnston enum fault_status res; 1463d47d3a94SMark Johnston bool dead; 1464d47d3a94SMark Johnston 1465fdb1dbb1SMateusz Guzik if (fs->object == fs->first_object || !fs->can_read_lock) 146673b951cdSMateusz Guzik VM_OBJECT_ASSERT_WLOCKED(fs->object); 1467fdb1dbb1SMateusz Guzik else 1468fdb1dbb1SMateusz Guzik VM_OBJECT_ASSERT_LOCKED(fs->object); 146973b951cdSMateusz Guzik 1470d47d3a94SMark Johnston /* 1471d47d3a94SMark Johnston * If the object is marked for imminent termination, we retry 1472d47d3a94SMark Johnston * here, since the collapse pass has raced with us. Otherwise, 1473d47d3a94SMark Johnston * if we see terminally dead object, return fail. 1474d47d3a94SMark Johnston */ 1475d47d3a94SMark Johnston if ((fs->object->flags & OBJ_DEAD) != 0) { 1476d47d3a94SMark Johnston dead = fs->object->type == OBJT_DEAD; 14770a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(fs); 1478d47d3a94SMark Johnston if (dead) 1479d47d3a94SMark Johnston return (FAULT_PROTECTION_FAILURE); 1480d47d3a94SMark Johnston pause("vmf_de", 1); 1481d47d3a94SMark Johnston return (FAULT_RESTART); 1482d47d3a94SMark Johnston } 1483d47d3a94SMark Johnston 1484d47d3a94SMark Johnston /* 1485d47d3a94SMark Johnston * See if the page is resident. 1486d47d3a94SMark Johnston */ 1487d47d3a94SMark Johnston fs->m = vm_page_lookup(fs->object, fs->pindex); 1488d47d3a94SMark Johnston if (fs->m != NULL) { 1489d47d3a94SMark Johnston if (!vm_page_tryxbusy(fs->m)) { 1490d47d3a94SMark Johnston vm_fault_busy_sleep(fs); 1491d47d3a94SMark Johnston return (FAULT_RESTART); 1492d47d3a94SMark Johnston } 1493d47d3a94SMark Johnston 1494d47d3a94SMark Johnston /* 1495d47d3a94SMark Johnston * The page is marked busy for other processes and the 1496d47d3a94SMark Johnston * pagedaemon. If it is still completely valid we are 1497d47d3a94SMark Johnston * done. 
1498d47d3a94SMark Johnston */ 1499d47d3a94SMark Johnston if (vm_page_all_valid(fs->m)) { 1500fdb1dbb1SMateusz Guzik VM_OBJECT_UNLOCK(fs->object); 1501d47d3a94SMark Johnston return (FAULT_SOFT); 1502d47d3a94SMark Johnston } 1503d47d3a94SMark Johnston } 1504d47d3a94SMark Johnston 1505d47d3a94SMark Johnston /* 1506d47d3a94SMark Johnston * Page is not resident. If the pager might contain the page 1507d47d3a94SMark Johnston * or this is the beginning of the search, allocate a new 15085d32157dSMark Johnston * page. 1509d47d3a94SMark Johnston */ 15100a310c94SMateusz Guzik if (fs->m == NULL && (vm_fault_object_needs_getpages(fs->object) || 1511d47d3a94SMark Johnston fs->object == fs->first_object)) { 1512fdb1dbb1SMateusz Guzik if (!vm_fault_object_ensure_wlocked(fs)) { 1513fdb1dbb1SMateusz Guzik fs->can_read_lock = false; 15140a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(fs); 1515fdb1dbb1SMateusz Guzik return (FAULT_RESTART); 1516fdb1dbb1SMateusz Guzik } 1517d47d3a94SMark Johnston res = vm_fault_allocate(fs); 1518d47d3a94SMark Johnston if (res != FAULT_CONTINUE) 1519d47d3a94SMark Johnston return (res); 1520d47d3a94SMark Johnston } 1521d47d3a94SMark Johnston 1522d47d3a94SMark Johnston /* 1523e08302f6SMark Johnston * Check to see if the pager can possibly satisfy this fault. 1524e08302f6SMark Johnston * If not, skip to the next object without dropping the lock to 1525e08302f6SMark Johnston * preserve atomicity of shadow faults. 1526d47d3a94SMark Johnston */ 15270a310c94SMateusz Guzik if (vm_fault_object_needs_getpages(fs->object)) { 1528d47d3a94SMark Johnston /* 1529d47d3a94SMark Johnston * At this point, we have either allocated a new page 1530d47d3a94SMark Johnston * or found an existing page that is only partially 1531d47d3a94SMark Johnston * valid. 1532d47d3a94SMark Johnston * 1533d47d3a94SMark Johnston * We hold a reference on the current object and the 1534d47d3a94SMark Johnston * page is exclusive busied. The exclusive busy 1535d47d3a94SMark Johnston * prevents simultaneous faults and collapses while 1536d47d3a94SMark Johnston * the object lock is dropped. 1537d47d3a94SMark Johnston */ 1538fdb1dbb1SMateusz Guzik VM_OBJECT_UNLOCK(fs->object); 1539d47d3a94SMark Johnston res = vm_fault_getpages(fs, behindp, aheadp); 1540d47d3a94SMark Johnston if (res == FAULT_CONTINUE) 1541d47d3a94SMark Johnston VM_OBJECT_WLOCK(fs->object); 1542d47d3a94SMark Johnston } else { 1543d47d3a94SMark Johnston res = FAULT_CONTINUE; 1544d47d3a94SMark Johnston } 1545d47d3a94SMark Johnston return (res); 1546d47d3a94SMark Johnston } 1547d47d3a94SMark Johnston 1548acd11c74SAlan Cox int 1549df08823dSKonstantin Belousov vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, 1550acd11c74SAlan Cox int fault_flags, vm_page_t *m_hold) 1551acd11c74SAlan Cox { 15524866e085SJohn Dyson struct faultstate fs; 1553f1b642c2SMark Johnston int ahead, behind, faultcount, rv; 1554f1b642c2SMark Johnston enum fault_status res; 1555fdb1dbb1SMateusz Guzik enum fault_next_status res_next; 1556d47d3a94SMark Johnston bool hardfault; 1557df8bae1dSRodney W. 
Grimes 155883c9dea1SGleb Smirnoff VM_CNT_INC(v_vm_faults); 1559c31cec45SKonstantin Belousov 1560c31cec45SKonstantin Belousov if ((curthread->td_pflags & TDP_NOFAULTING) != 0) 1561c31cec45SKonstantin Belousov return (KERN_PROTECTION_FAILURE); 1562c31cec45SKonstantin Belousov 1563d2bf64c3SKonstantin Belousov fs.vp = NULL; 15645949b1caSJeff Roberson fs.vaddr = vaddr; 15652c2f4413SJeff Roberson fs.m_hold = m_hold; 15662c2f4413SJeff Roberson fs.fault_flags = fault_flags; 1567c308a3a6SJeff Roberson fs.map = map; 1568c308a3a6SJeff Roberson fs.lookup_still_valid = false; 1569174aad04SKonstantin Belousov fs.oom_started = false; 157045c09a74SMark Johnston fs.nera = -1; 1571fdb1dbb1SMateusz Guzik fs.can_read_lock = true; 1572b0cd2017SGleb Smirnoff faultcount = 0; 1573320023e2SAlan Cox hardfault = false; 1574df8bae1dSRodney W. Grimes 1575245139c6SKonstantin Belousov RetryFault: 15762c2f4413SJeff Roberson fs.fault_type = fault_type; 1577df8bae1dSRodney W. Grimes 1578df8bae1dSRodney W. Grimes /* 15790d94caffSDavid Greenman * Find the backing store object and offset into it to begin the 15800d94caffSDavid Greenman * search. 1581df8bae1dSRodney W. Grimes */ 1582f1b642c2SMark Johnston rv = vm_fault_lookup(&fs); 1583f1b642c2SMark Johnston if (rv != KERN_SUCCESS) { 1584f1b642c2SMark Johnston if (rv == KERN_RESOURCE_SHORTAGE) 1585c308a3a6SJeff Roberson goto RetryFault; 1586f1b642c2SMark Johnston return (rv); 158709e0c6ccSJohn Dyson } 158809e0c6ccSJohn Dyson 15898d67b8c8SAlan Cox /* 15908d67b8c8SAlan Cox * Try to avoid lock contention on the top-level object through 15918d67b8c8SAlan Cox * special-case handling of some types of page faults, specifically, 159267d0e293SJeff Roberson * those that are mapping an existing page from the top-level object. 159367d0e293SJeff Roberson * Under this condition, a read lock on the object suffices, allowing 159467d0e293SJeff Roberson * multiple page faults of a similar type to run in parallel. 15958d67b8c8SAlan Cox */ 1596afe55ca3SKonstantin Belousov if (fs.vp == NULL /* avoid locked vnode leak */ && 1597d301b358SKonstantin Belousov (fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0 && 15982c2f4413SJeff Roberson (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) { 1599f1b642c2SMark Johnston res = vm_fault_soft_fast(&fs); 16000e71f4f7SMateusz Guzik if (res == FAULT_SUCCESS) { 16010e71f4f7SMateusz Guzik VM_OBJECT_ASSERT_UNLOCKED(fs.first_object); 1602f1b642c2SMark Johnston return (KERN_SUCCESS); 1603afe55ca3SKonstantin Belousov } 16040e71f4f7SMateusz Guzik VM_OBJECT_ASSERT_WLOCKED(fs.first_object); 1605afe55ca3SKonstantin Belousov } else { 1606afe55ca3SKonstantin Belousov VM_OBJECT_WLOCK(fs.first_object); 1607afe55ca3SKonstantin Belousov } 1608afe55ca3SKonstantin Belousov 160995e5e988SJohn Dyson /* 161095e5e988SJohn Dyson * Make a reference to this object to prevent its disposal while we 161195e5e988SJohn Dyson * are messing with it. Once we have the reference, the map is free 161295e5e988SJohn Dyson * to be diddled. Since objects reference their shadows (and copies), 161395e5e988SJohn Dyson * they will stay around as well. 1614fe8e0238SMatthew Dillon * 1615fe8e0238SMatthew Dillon * Bump the paging-in-progress count to prevent size changes (e.g. 1616dda4d369SAlan Cox * truncation operations) during I/O. 
161795e5e988SJohn Dyson */ 1618a976eb5eSAlan Cox vm_object_reference_locked(fs.first_object); 1619d474eaaaSDoug Rabson vm_object_pip_add(fs.first_object, 1); 162095e5e988SJohn Dyson 162158447749SJeff Roberson fs.m_cow = fs.m = fs.first_m = NULL; 1622df8bae1dSRodney W. Grimes 1623df8bae1dSRodney W. Grimes /* 1624df8bae1dSRodney W. Grimes * Search for the page at object/offset. 1625df8bae1dSRodney W. Grimes */ 16264866e085SJohn Dyson fs.object = fs.first_object; 16274866e085SJohn Dyson fs.pindex = fs.first_pindex; 1628d301b358SKonstantin Belousov 1629d301b358SKonstantin Belousov if ((fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) { 1630f1b642c2SMark Johnston res = vm_fault_allocate(&fs); 1631f1b642c2SMark Johnston switch (res) { 1632f1b642c2SMark Johnston case FAULT_RESTART: 1633d301b358SKonstantin Belousov goto RetryFault; 1634f1b642c2SMark Johnston case FAULT_SUCCESS: 1635f1b642c2SMark Johnston return (KERN_SUCCESS); 1636f1b642c2SMark Johnston case FAULT_FAILURE: 1637f1b642c2SMark Johnston return (KERN_FAILURE); 1638f1b642c2SMark Johnston case FAULT_OUT_OF_BOUNDS: 1639f1b642c2SMark Johnston return (KERN_OUT_OF_BOUNDS); 1640f1b642c2SMark Johnston case FAULT_CONTINUE: 1641d301b358SKonstantin Belousov break; 1642d301b358SKonstantin Belousov default: 1643f1b642c2SMark Johnston panic("vm_fault: Unhandled status %d", res); 1644d301b358SKonstantin Belousov } 1645d301b358SKonstantin Belousov } 1646d301b358SKonstantin Belousov 1647df8bae1dSRodney W. Grimes while (TRUE) { 16484bf95d00SJeff Roberson KASSERT(fs.m == NULL, 16494bf95d00SJeff Roberson ("page still set %p at loop start", fs.m)); 165047221757SJohn Dyson 1651d47d3a94SMark Johnston res = vm_fault_object(&fs, &behind, &ahead); 1652f1b642c2SMark Johnston switch (res) { 1653d47d3a94SMark Johnston case FAULT_SOFT: 1654d47d3a94SMark Johnston goto found; 1655d47d3a94SMark Johnston case FAULT_HARD: 1656d47d3a94SMark Johnston faultcount = behind + 1 + ahead; 1657d47d3a94SMark Johnston hardfault = true; 1658d47d3a94SMark Johnston goto found; 1659f1b642c2SMark Johnston case FAULT_RESTART: 1660df794f5cSJeff Roberson goto RetryFault; 1661f1b642c2SMark Johnston case FAULT_SUCCESS: 1662f1b642c2SMark Johnston return (KERN_SUCCESS); 1663f1b642c2SMark Johnston case FAULT_FAILURE: 1664f1b642c2SMark Johnston return (KERN_FAILURE); 1665f1b642c2SMark Johnston case FAULT_OUT_OF_BOUNDS: 1666f1b642c2SMark Johnston return (KERN_OUT_OF_BOUNDS); 1667d47d3a94SMark Johnston case FAULT_PROTECTION_FAILURE: 1668d47d3a94SMark Johnston return (KERN_PROTECTION_FAILURE); 1669f1b642c2SMark Johnston case FAULT_CONTINUE: 1670c42b43a0SKonstantin Belousov break; 1671c42b43a0SKonstantin Belousov default: 1672f1b642c2SMark Johnston panic("vm_fault: Unhandled status %d", res); 1673c42b43a0SKonstantin Belousov } 16744bf95d00SJeff Roberson 1675521ddf39SAlan Cox /* 16765909dafeSJeff Roberson * The page was not found in the current object. Try to 16775909dafeSJeff Roberson * traverse into a backing object or zero fill if none is 16785909dafeSJeff Roberson * found. 
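 * vm_fault_next() reports one of three outcomes: the whole fault must
 * be retried, a backing object was entered (the loop continues), or the
 * chain is exhausted and the page is zero filled below.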
1679521ddf39SAlan Cox */ 1680fdb1dbb1SMateusz Guzik res_next = vm_fault_next(&fs); 1681fdb1dbb1SMateusz Guzik if (res_next == FAULT_NEXT_RESTART) 1682fdb1dbb1SMateusz Guzik goto RetryFault; 1683fdb1dbb1SMateusz Guzik else if (res_next == FAULT_NEXT_GOTOBJ) 1684fb4d37eaSJeff Roberson continue; 1685fdb1dbb1SMateusz Guzik MPASS(res_next == FAULT_NEXT_NOOBJ); 1686f31695ccSMark Johnston if ((fs.fault_flags & VM_FAULT_NOFILL) != 0) { 1687f31695ccSMark Johnston if (fs.first_object == fs.object) 16880a310c94SMateusz Guzik vm_fault_page_free(&fs.first_m); 16890a310c94SMateusz Guzik vm_fault_unlock_and_deallocate(&fs); 1690f31695ccSMark Johnston return (KERN_OUT_OF_BOUNDS); 1691f31695ccSMark Johnston } 1692fdb1dbb1SMateusz Guzik VM_OBJECT_UNLOCK(fs.object); 1693fb4d37eaSJeff Roberson vm_fault_zerofill(&fs); 16947b9b301cSAlan Cox /* Don't try to prefault neighboring pages. */ 16957b9b301cSAlan Cox faultcount = 1; 1696d47d3a94SMark Johnston break; 1697df8bae1dSRodney W. Grimes } 16981c7c3c6aSMatthew Dillon 1699d47d3a94SMark Johnston found: 1700df8bae1dSRodney W. Grimes /* 1701d47d3a94SMark Johnston * A valid page has been found and exclusively busied. The 1702d47d3a94SMark Johnston * object lock must no longer be held. 1703df8bae1dSRodney W. Grimes */ 17041e40fe41SJeff Roberson vm_page_assert_xbusied(fs.m); 17051e40fe41SJeff Roberson VM_OBJECT_ASSERT_UNLOCKED(fs.object); 1706df8bae1dSRodney W. Grimes 1707df8bae1dSRodney W. Grimes /* 17080d94caffSDavid Greenman * If the page is being written, but isn't already owned by the 17090d94caffSDavid Greenman * top-level object, we have to copy it into a new page owned by the 17100d94caffSDavid Greenman * top-level object. 1711df8bae1dSRodney W. Grimes */ 17124866e085SJohn Dyson if (fs.object != fs.first_object) { 1713df8bae1dSRodney W. Grimes /* 17140d94caffSDavid Greenman * We only really need to copy if we want to write it. 1715df8bae1dSRodney W. Grimes */ 17162c2f4413SJeff Roberson if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { 17175936b6a8SJeff Roberson vm_fault_cow(&fs); 17189f1abe3dSAlan Cox /* 17199f1abe3dSAlan Cox * We only try to prefault read-only mappings to the 17209f1abe3dSAlan Cox * neighboring pages when this copy-on-write fault is 17219f1abe3dSAlan Cox * a hard fault. In other cases, trying to prefault 17229f1abe3dSAlan Cox * is typically wasted effort. 17239f1abe3dSAlan Cox */ 17249f1abe3dSAlan Cox if (faultcount == 0) 17259f1abe3dSAlan Cox faultcount = 1; 17269f1abe3dSAlan Cox 17270d94caffSDavid Greenman } else { 17282c2f4413SJeff Roberson fs.prot &= ~VM_PROT_WRITE; 1729df8bae1dSRodney W. Grimes } 1730df8bae1dSRodney W. Grimes } 1731df8bae1dSRodney W. Grimes 1732df8bae1dSRodney W. Grimes /* 17330d94caffSDavid Greenman * We must verify that the maps have not changed since our last 17340d94caffSDavid Greenman * lookup. 1735df8bae1dSRodney W. Grimes */ 173619dc5607STor Egge if (!fs.lookup_still_valid) { 1737f1b642c2SMark Johnston rv = vm_fault_relookup(&fs); 1738f1b642c2SMark Johnston if (rv != KERN_SUCCESS) { 17390a310c94SMateusz Guzik vm_fault_deallocate(&fs); 1740f1b642c2SMark Johnston if (rv == KERN_RESTART) 174119dc5607STor Egge goto RetryFault; 1742f1b642c2SMark Johnston return (rv); 1743df8bae1dSRodney W. 
Grimes } 174419dc5607STor Egge } 17451e40fe41SJeff Roberson VM_OBJECT_ASSERT_UNLOCKED(fs.object); 1746381b7242SAlan Cox 1747d2bf64c3SKonstantin Belousov /* 1748381b7242SAlan Cox * If the page was filled by a pager, save the virtual address that 1749381b7242SAlan Cox * should be faulted on next under a sequential access pattern to the 1750381b7242SAlan Cox * map entry. A read lock on the map suffices to update this address 1751381b7242SAlan Cox * safely. 1752d2bf64c3SKonstantin Belousov */ 17535758fe71SAlan Cox if (hardfault) 1754381b7242SAlan Cox fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; 1755d2bf64c3SKonstantin Belousov 17564221e284SAlan Cox /* 175778cfe1f7SAlan Cox * Page must be completely valid or it is not fit to 17584221e284SAlan Cox * map into user space. vm_pager_get_pages() ensures this. 17594221e284SAlan Cox */ 17601e40fe41SJeff Roberson vm_page_assert_xbusied(fs.m); 17610012f373SJeff Roberson KASSERT(vm_page_all_valid(fs.m), 176278cfe1f7SAlan Cox ("vm_fault: page %p partially invalid", fs.m)); 17631e40fe41SJeff Roberson 17642c2f4413SJeff Roberson vm_fault_dirty(&fs, fs.m); 1765cbfbaad8SAlan Cox 176686735996SAlan Cox /* 176786735996SAlan Cox * Put this page into the physical map. We had to do the unlock above 176886735996SAlan Cox * because pmap_enter() may sleep. We don't put the page 176986735996SAlan Cox * back on the active queue until later so that the pageout daemon 177086735996SAlan Cox * won't find it (yet). 177186735996SAlan Cox */ 17722c2f4413SJeff Roberson pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, 17732c2f4413SJeff Roberson fs.fault_type | (fs.wired ? PMAP_ENTER_WIRED : 0), 0); 17742c2f4413SJeff Roberson if (faultcount != 1 && (fs.fault_flags & VM_FAULT_WIRE) == 0 && 17752c2f4413SJeff Roberson fs.wired == 0) 1776b0cd2017SGleb Smirnoff vm_fault_prefault(&fs, vaddr, 1777b0cd2017SGleb Smirnoff faultcount > 0 ? behind : PFBAK, 1778a7163bb9SKonstantin Belousov faultcount > 0 ? ahead : PFFOR, false); 1779ff97964aSJohn Dyson 1780df8bae1dSRodney W. Grimes /* 17810d94caffSDavid Greenman * If the page is not wired down, then put it where the pageout daemon 17820d94caffSDavid Greenman * can find it. 1783df8bae1dSRodney W. 
Grimes */ 17842c2f4413SJeff Roberson if ((fs.fault_flags & VM_FAULT_WIRE) != 0) 17854866e085SJohn Dyson vm_page_wire(fs.m); 17869f5632e6SMark Johnston else 17874866e085SJohn Dyson vm_page_activate(fs.m); 17882c2f4413SJeff Roberson if (fs.m_hold != NULL) { 17892c2f4413SJeff Roberson (*fs.m_hold) = fs.m; 1790eeacb3b0SMark Johnston vm_page_wire(fs.m); 1791acd11c74SAlan Cox } 1792c7aebda8SAttilio Rao vm_page_xunbusy(fs.m); 17934bf95d00SJeff Roberson fs.m = NULL; 1794eeec6babSJohn Baldwin 1795eebf3286SAlan Cox /* 1796eebf3286SAlan Cox * Unlock everything, and return 1797eebf3286SAlan Cox */ 17980a310c94SMateusz Guzik vm_fault_deallocate(&fs); 1799b3a01bdfSAndrey Zonov if (hardfault) { 180083c9dea1SGleb Smirnoff VM_CNT_INC(v_io_faults); 18011c4bcd05SJeff Roberson curthread->td_ru.ru_majflt++; 1802ae34b6ffSEdward Tomasz Napierala #ifdef RACCT 1803ae34b6ffSEdward Tomasz Napierala if (racct_enable && fs.object->type == OBJT_VNODE) { 1804ae34b6ffSEdward Tomasz Napierala PROC_LOCK(curproc); 18052c2f4413SJeff Roberson if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { 1806ae34b6ffSEdward Tomasz Napierala racct_add_force(curproc, RACCT_WRITEBPS, 1807ae34b6ffSEdward Tomasz Napierala PAGE_SIZE + behind * PAGE_SIZE); 1808ae34b6ffSEdward Tomasz Napierala racct_add_force(curproc, RACCT_WRITEIOPS, 1); 1809ae34b6ffSEdward Tomasz Napierala } else { 1810ae34b6ffSEdward Tomasz Napierala racct_add_force(curproc, RACCT_READBPS, 1811ae34b6ffSEdward Tomasz Napierala PAGE_SIZE + ahead * PAGE_SIZE); 1812ae34b6ffSEdward Tomasz Napierala racct_add_force(curproc, RACCT_READIOPS, 1); 1813ae34b6ffSEdward Tomasz Napierala } 1814ae34b6ffSEdward Tomasz Napierala PROC_UNLOCK(curproc); 1815ae34b6ffSEdward Tomasz Napierala } 1816ae34b6ffSEdward Tomasz Napierala #endif 1817b3a01bdfSAndrey Zonov } else 18181c4bcd05SJeff Roberson curthread->td_ru.ru_minflt++; 1819df8bae1dSRodney W. Grimes 1820df8bae1dSRodney W. Grimes return (KERN_SUCCESS); 1821df8bae1dSRodney W. Grimes } 1822df8bae1dSRodney W. Grimes 1823df8bae1dSRodney W. Grimes /* 1824a8b0f100SAlan Cox * Speed up the reclamation of pages that precede the faulting pindex within 1825a8b0f100SAlan Cox * the first object of the shadow chain. Essentially, perform the equivalent 1826a8b0f100SAlan Cox * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes 1827a8b0f100SAlan Cox * the faulting pindex by the cluster size when the pages read by vm_fault() 1828a8b0f100SAlan Cox * cross a cluster-size boundary. The cluster size is the greater of the 1829a8b0f100SAlan Cox * smallest superpage size and VM_FAULT_DONTNEED_MIN. 1830a8b0f100SAlan Cox * 1831a8b0f100SAlan Cox * When "fs->first_object" is a shadow object, the pages in the backing object 1832a8b0f100SAlan Cox * that precede the faulting pindex are deactivated by vm_fault(). So, this 1833a8b0f100SAlan Cox * function must only be concerned with pages in the first object. 
183413458803SAlan Cox */ 183513458803SAlan Cox static void 1836a8b0f100SAlan Cox vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) 183713458803SAlan Cox { 1838a8b0f100SAlan Cox vm_map_entry_t entry; 183940cbcb99SJohn Baldwin vm_object_t first_object; 1840a8b0f100SAlan Cox vm_offset_t end, start; 1841a8b0f100SAlan Cox vm_page_t m, m_next; 1842a8b0f100SAlan Cox vm_pindex_t pend, pstart; 1843a8b0f100SAlan Cox vm_size_t size; 184413458803SAlan Cox 184540cbcb99SJohn Baldwin VM_OBJECT_ASSERT_UNLOCKED(fs->object); 184613458803SAlan Cox first_object = fs->first_object; 1847a8b0f100SAlan Cox /* Neither fictitious nor unmanaged pages can be reclaimed. */ 184828634820SAlan Cox if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { 18491e40fe41SJeff Roberson VM_OBJECT_RLOCK(first_object); 1850a8b0f100SAlan Cox size = VM_FAULT_DONTNEED_MIN; 1851a8b0f100SAlan Cox if (MAXPAGESIZES > 1 && size < pagesizes[1]) 1852a8b0f100SAlan Cox size = pagesizes[1]; 1853a8b0f100SAlan Cox end = rounddown2(vaddr, size); 1854a8b0f100SAlan Cox if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && 1855a8b0f100SAlan Cox (entry = fs->entry)->start < end) { 1856a8b0f100SAlan Cox if (end - entry->start < size) 1857a8b0f100SAlan Cox start = entry->start; 185813458803SAlan Cox else 1859a8b0f100SAlan Cox start = end - size; 1860a8b0f100SAlan Cox pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); 1861a8b0f100SAlan Cox pstart = OFF_TO_IDX(entry->offset) + atop(start - 1862a8b0f100SAlan Cox entry->start); 1863a8b0f100SAlan Cox m_next = vm_page_find_least(first_object, pstart); 1864a8b0f100SAlan Cox pend = OFF_TO_IDX(entry->offset) + atop(end - 1865a8b0f100SAlan Cox entry->start); 1866a8b0f100SAlan Cox while ((m = m_next) != NULL && m->pindex < pend) { 1867a8b0f100SAlan Cox m_next = TAILQ_NEXT(m, listq); 18680012f373SJeff Roberson if (!vm_page_all_valid(m) || 1869a8b0f100SAlan Cox vm_page_busied(m)) 187013458803SAlan Cox continue; 1871d8015db3SAlan Cox 1872d8015db3SAlan Cox /* 1873d8015db3SAlan Cox * Don't clear PGA_REFERENCED, since it would 1874d8015db3SAlan Cox * likely represent a reference by a different 1875d8015db3SAlan Cox * process. 1876d8015db3SAlan Cox * 1877d8015db3SAlan Cox * Typically, at this point, prefetched pages 1878d8015db3SAlan Cox * are still in the inactive queue. Only 1879d8015db3SAlan Cox * pages that triggered page faults are in the 18809f5632e6SMark Johnston * active queue. The test for whether the page 18819f5632e6SMark Johnston * is in the inactive queue is racy; in the 18829f5632e6SMark Johnston * worst case we will requeue the page 18839f5632e6SMark Johnston * unnecessarily. 1884d8015db3SAlan Cox */ 18850eb50f9cSMark Johnston if (!vm_page_inactive(m)) 1886d8015db3SAlan Cox vm_page_deactivate(m); 188713458803SAlan Cox } 188813458803SAlan Cox } 18891e40fe41SJeff Roberson VM_OBJECT_RUNLOCK(first_object); 1890a8b0f100SAlan Cox } 189113458803SAlan Cox } 189213458803SAlan Cox 189313458803SAlan Cox /* 1894566526a9SAlan Cox * vm_fault_prefault provides a quick way of clustering 1895566526a9SAlan Cox * pagefaults into a processes address space. It is a "cousin" 1896566526a9SAlan Cox * of vm_map_pmap_enter, except it runs at page fault time instead 1897566526a9SAlan Cox * of mmap time. 
1898566526a9SAlan Cox */ 1899566526a9SAlan Cox static void 190063281952SAlan Cox vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, 1901a7163bb9SKonstantin Belousov int backward, int forward, bool obj_locked) 1902566526a9SAlan Cox { 190363281952SAlan Cox pmap_t pmap; 190463281952SAlan Cox vm_map_entry_t entry; 190563281952SAlan Cox vm_object_t backing_object, lobject; 1906566526a9SAlan Cox vm_offset_t addr, starta; 1907566526a9SAlan Cox vm_pindex_t pindex; 19082053c127SStephan Uphoff vm_page_t m; 1909f1d73aacSAlan Cox vm_prot_t prot; 1910b0cd2017SGleb Smirnoff int i; 1911566526a9SAlan Cox 191263281952SAlan Cox pmap = fs->map->pmap; 1913950d5f7aSAlan Cox if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) 1914566526a9SAlan Cox return; 1915566526a9SAlan Cox 191663281952SAlan Cox entry = fs->entry; 1917566526a9SAlan Cox 191863cdcaaeSKonstantin Belousov if (addra < backward * PAGE_SIZE) { 1919566526a9SAlan Cox starta = entry->start; 192063cdcaaeSKonstantin Belousov } else { 192163cdcaaeSKonstantin Belousov starta = addra - backward * PAGE_SIZE; 192263cdcaaeSKonstantin Belousov if (starta < entry->start) 192363cdcaaeSKonstantin Belousov starta = entry->start; 1924566526a9SAlan Cox } 1925f1d73aacSAlan Cox prot = entry->protection; 1926f1d73aacSAlan Cox 1927f1d73aacSAlan Cox /* 1928f1d73aacSAlan Cox * If pmap_enter() has enabled write access on a nearby mapping, then 1929f1d73aacSAlan Cox * don't attempt promotion, because it will fail. 1930f1d73aacSAlan Cox */ 1931f1d73aacSAlan Cox if ((fs->prot & VM_PROT_WRITE) != 0) 1932f1d73aacSAlan Cox prot |= VM_PROT_NO_PROMOTE; 1933566526a9SAlan Cox 193463281952SAlan Cox /* 193563281952SAlan Cox * Generate the sequence of virtual addresses that are candidates for 193663281952SAlan Cox * prefaulting in an outward spiral from the faulting virtual address, 193763281952SAlan Cox * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra 193863281952SAlan Cox * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... 193963281952SAlan Cox * If the candidate address doesn't have a backing physical page, then 194063281952SAlan Cox * the loop immediately terminates. 194163281952SAlan Cox */ 194263281952SAlan Cox for (i = 0; i < 2 * imax(backward, forward); i++) { 194363281952SAlan Cox addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? 
-PAGE_SIZE : 194463281952SAlan Cox PAGE_SIZE); 194563281952SAlan Cox if (addr > addra + forward * PAGE_SIZE) 1946566526a9SAlan Cox addr = 0; 1947566526a9SAlan Cox 1948566526a9SAlan Cox if (addr < starta || addr >= entry->end) 1949566526a9SAlan Cox continue; 1950566526a9SAlan Cox 1951566526a9SAlan Cox if (!pmap_is_prefaultable(pmap, addr)) 1952566526a9SAlan Cox continue; 1953566526a9SAlan Cox 1954566526a9SAlan Cox pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; 195563281952SAlan Cox lobject = entry->object.vm_object; 1956a7163bb9SKonstantin Belousov if (!obj_locked) 1957c141ae7fSAlan Cox VM_OBJECT_RLOCK(lobject); 1958566526a9SAlan Cox while ((m = vm_page_lookup(lobject, pindex)) == NULL && 19590a310c94SMateusz Guzik !vm_fault_object_needs_getpages(lobject) && 1960566526a9SAlan Cox (backing_object = lobject->backing_object) != NULL) { 196136930fc9SAlan Cox KASSERT((lobject->backing_object_offset & PAGE_MASK) == 196236930fc9SAlan Cox 0, ("vm_fault_prefault: unaligned object offset")); 1963566526a9SAlan Cox pindex += lobject->backing_object_offset >> PAGE_SHIFT; 1964c141ae7fSAlan Cox VM_OBJECT_RLOCK(backing_object); 1965a7163bb9SKonstantin Belousov if (!obj_locked || lobject != entry->object.vm_object) 1966c141ae7fSAlan Cox VM_OBJECT_RUNLOCK(lobject); 1967566526a9SAlan Cox lobject = backing_object; 1968566526a9SAlan Cox } 1969cbfbaad8SAlan Cox if (m == NULL) { 1970a7163bb9SKonstantin Belousov if (!obj_locked || lobject != entry->object.vm_object) 1971c141ae7fSAlan Cox VM_OBJECT_RUNLOCK(lobject); 1972566526a9SAlan Cox break; 1973cbfbaad8SAlan Cox } 19740012f373SJeff Roberson if (vm_page_all_valid(m) && 19753c4a2440SAlan Cox (m->flags & PG_FICTITIOUS) == 0) 1976f1d73aacSAlan Cox pmap_enter_quick(pmap, addr, m, prot); 1977a7163bb9SKonstantin Belousov if (!obj_locked || lobject != entry->object.vm_object) 1978c141ae7fSAlan Cox VM_OBJECT_RUNLOCK(lobject); 1979566526a9SAlan Cox } 1980566526a9SAlan Cox } 1981566526a9SAlan Cox 1982566526a9SAlan Cox /* 198382de724fSAlan Cox * Hold each of the physical pages that are mapped by the specified range of 198482de724fSAlan Cox * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid 198582de724fSAlan Cox * and allow the specified types of access, "prot". If all of the implied 198682de724fSAlan Cox * pages are successfully held, then the number of held pages is returned 198782de724fSAlan Cox * together with pointers to those pages in the array "ma". However, if any 198882de724fSAlan Cox * of the pages cannot be held, -1 is returned. 
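 *
 * For illustration only, a sketch of a typical caller; the buffer
 * address, length, and error value are placeholders, and "len" must span
 * no more than nitems(ma) pages or the function panics.  The held pages
 * are released with vm_page_unhold_pages() when the caller is done:
 *
 *	vm_page_t ma[4];
 *	int n;
 *
 *	n = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 *	    uaddr, len, VM_PROT_READ, ma, nitems(ma));
 *	if (n == -1)
 *		return (EFAULT);
 *	... access the held pages ...
 *	vm_page_unhold_pages(ma, n);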
198982de724fSAlan Cox */ 199082de724fSAlan Cox int 199182de724fSAlan Cox vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, 199282de724fSAlan Cox vm_prot_t prot, vm_page_t *ma, int max_count) 199382de724fSAlan Cox { 199482de724fSAlan Cox vm_offset_t end, va; 199582de724fSAlan Cox vm_page_t *mp; 19967e14088dSKonstantin Belousov int count; 199782de724fSAlan Cox boolean_t pmap_failed; 199882de724fSAlan Cox 1999af32c419SKonstantin Belousov if (len == 0) 2000af32c419SKonstantin Belousov return (0); 200182de724fSAlan Cox end = round_page(addr + len); 200282de724fSAlan Cox addr = trunc_page(addr); 200382de724fSAlan Cox 20040f1e6ec5SMark Johnston if (!vm_map_range_valid(map, addr, end)) 200582de724fSAlan Cox return (-1); 200682de724fSAlan Cox 20077e14088dSKonstantin Belousov if (atop(end - addr) > max_count) 200882de724fSAlan Cox panic("vm_fault_quick_hold_pages: count > max_count"); 20097e14088dSKonstantin Belousov count = atop(end - addr); 201082de724fSAlan Cox 201182de724fSAlan Cox /* 201282de724fSAlan Cox * Most likely, the physical pages are resident in the pmap, so it is 201382de724fSAlan Cox * faster to try pmap_extract_and_hold() first. 201482de724fSAlan Cox */ 201582de724fSAlan Cox pmap_failed = FALSE; 201682de724fSAlan Cox for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { 201782de724fSAlan Cox *mp = pmap_extract_and_hold(map->pmap, va, prot); 201882de724fSAlan Cox if (*mp == NULL) 201982de724fSAlan Cox pmap_failed = TRUE; 202082de724fSAlan Cox else if ((prot & VM_PROT_WRITE) != 0 && 2021a5dbab54SAlan Cox (*mp)->dirty != VM_PAGE_BITS_ALL) { 202282de724fSAlan Cox /* 202382de724fSAlan Cox * Explicitly dirty the physical page. Otherwise, the 202482de724fSAlan Cox * caller's changes may go unnoticed because they are 202582de724fSAlan Cox * performed through an unmanaged mapping or by a DMA 202682de724fSAlan Cox * operation. 20273c76db4cSAlan Cox * 2028abb9b935SKonstantin Belousov * The object lock is not held here. 2029abb9b935SKonstantin Belousov * See vm_page_clear_dirty_mask(). 203082de724fSAlan Cox */ 20313c76db4cSAlan Cox vm_page_dirty(*mp); 203282de724fSAlan Cox } 203382de724fSAlan Cox } 203482de724fSAlan Cox if (pmap_failed) { 203582de724fSAlan Cox /* 203682de724fSAlan Cox * One or more pages could not be held by the pmap. Either no 203782de724fSAlan Cox * page was mapped at the specified virtual address or that 203882de724fSAlan Cox * mapping had insufficient permissions. Attempt to fault in 203982de724fSAlan Cox * and hold these pages. 20408ec533d3SKonstantin Belousov * 20418ec533d3SKonstantin Belousov * If vm_fault_disable_pagefaults() was called, 20428ec533d3SKonstantin Belousov * i.e., TDP_NOFAULTING is set, we must not sleep nor 20438ec533d3SKonstantin Belousov * acquire MD VM locks, which means we must not call 2044df08823dSKonstantin Belousov * vm_fault(). Some (out of tree) callers mark 20458ec533d3SKonstantin Belousov * too wide a code area with vm_fault_disable_pagefaults() 20468ec533d3SKonstantin Belousov * already, use the VM_PROT_QUICK_NOFAULT flag to request 20478ec533d3SKonstantin Belousov * the proper behaviour explicitly. 
204882de724fSAlan Cox */ 20498ec533d3SKonstantin Belousov if ((prot & VM_PROT_QUICK_NOFAULT) != 0 && 20508ec533d3SKonstantin Belousov (curthread->td_pflags & TDP_NOFAULTING) != 0) 20518ec533d3SKonstantin Belousov goto error; 205282de724fSAlan Cox for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) 2053df08823dSKonstantin Belousov if (*mp == NULL && vm_fault(map, va, prot, 205482de724fSAlan Cox VM_FAULT_NORMAL, mp) != KERN_SUCCESS) 205582de724fSAlan Cox goto error; 205682de724fSAlan Cox } 205782de724fSAlan Cox return (count); 205882de724fSAlan Cox error: 205982de724fSAlan Cox for (mp = ma; mp < ma + count; mp++) 2060fee2a2faSMark Johnston if (*mp != NULL) 2061fee2a2faSMark Johnston vm_page_unwire(*mp, PQ_INACTIVE); 206282de724fSAlan Cox return (-1); 206382de724fSAlan Cox } 206482de724fSAlan Cox 206582de724fSAlan Cox /* 2066df8bae1dSRodney W. Grimes * Routine: 2067df8bae1dSRodney W. Grimes * vm_fault_copy_entry 2068df8bae1dSRodney W. Grimes * Function: 2069b57be759SMark Johnston * Create new object backing dst_entry with private copy of all 2070b57be759SMark Johnston * underlying pages. When src_entry is equal to dst_entry, function 2071b57be759SMark Johnston * implements COW for wired-down map entry. Otherwise, it forks 2072b57be759SMark Johnston * wired entry into dst_map. 2073df8bae1dSRodney W. Grimes * 2074df8bae1dSRodney W. Grimes * In/out conditions: 2075df8bae1dSRodney W. Grimes * The source and destination maps must be locked for write. 2076df8bae1dSRodney W. Grimes * The source map entry must be wired down (or be a sharing map 2077df8bae1dSRodney W. Grimes * entry corresponding to a main map entry that is wired down). 2078df8bae1dSRodney W. Grimes */ 207926f9a767SRodney W. Grimes void 2080b57be759SMark Johnston vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map __unused, 2081121fd461SKonstantin Belousov vm_map_entry_t dst_entry, vm_map_entry_t src_entry, 2082121fd461SKonstantin Belousov vm_ooffset_t *fork_charge) 2083df8bae1dSRodney W. Grimes { 2084210a6886SKonstantin Belousov vm_object_t backing_object, dst_object, object, src_object; 20857afab86cSAlan Cox vm_pindex_t dst_pindex, pindex, src_pindex; 2086210a6886SKonstantin Belousov vm_prot_t access, prot; 2087df8bae1dSRodney W. Grimes vm_offset_t vaddr; 2088df8bae1dSRodney W. Grimes vm_page_t dst_m; 2089df8bae1dSRodney W. Grimes vm_page_t src_m; 2090b57be759SMark Johnston bool upgrade; 2091df8bae1dSRodney W. Grimes 2092210a6886SKonstantin Belousov upgrade = src_entry == dst_entry; 2093b57be759SMark Johnston KASSERT(upgrade || dst_entry->object.vm_object == NULL, 2094b57be759SMark Johnston ("vm_fault_copy_entry: vm_object not NULL")); 2095b57be759SMark Johnston 2096b57be759SMark Johnston /* 2097b57be759SMark Johnston * If not an upgrade, then enter the mappings in the pmap as 2098b57be759SMark Johnston * read and/or execute accesses. Otherwise, enter them as 2099b57be759SMark Johnston * write accesses. 2100b57be759SMark Johnston * 2101b57be759SMark Johnston * A writeable large page mapping is only created if all of 2102b57be759SMark Johnston * the constituent small page mappings are modified. Marking 2103b57be759SMark Johnston * PTEs as modified on inception allows promotion to happen 2104b57be759SMark Johnston * without taking potentially large number of soft faults. 2105b57be759SMark Johnston */ 21060973283dSKonstantin Belousov access = prot = dst_entry->protection; 2107b57be759SMark Johnston if (!upgrade) 2108b57be759SMark Johnston access &= ~VM_PROT_WRITE; 2109210a6886SKonstantin Belousov 2110df8bae1dSRodney W. 
Grimes src_object = src_entry->object.vm_object; 21117afab86cSAlan Cox src_pindex = OFF_TO_IDX(src_entry->offset); 2112df8bae1dSRodney W. Grimes 21130973283dSKonstantin Belousov if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { 21140973283dSKonstantin Belousov dst_object = src_object; 21150973283dSKonstantin Belousov vm_object_reference(dst_object); 21160973283dSKonstantin Belousov } else { 2117df8bae1dSRodney W. Grimes /* 211867388836SKonstantin Belousov * Create the top-level object for the destination entry. 211967388836SKonstantin Belousov * Doesn't actually shadow anything - we copy the pages 212067388836SKonstantin Belousov * directly. 2121df8bae1dSRodney W. Grimes */ 212267388836SKonstantin Belousov dst_object = vm_object_allocate_anon(atop(dst_entry->end - 212367388836SKonstantin Belousov dst_entry->start), NULL, NULL, 0); 2124f8a47341SAlan Cox #if VM_NRESERVLEVEL > 0 2125f8a47341SAlan Cox dst_object->flags |= OBJ_COLORED; 2126f8a47341SAlan Cox dst_object->pg_color = atop(dst_entry->start); 2127f8a47341SAlan Cox #endif 2128a60d3db1SKonstantin Belousov dst_object->domain = src_object->domain; 2129a60d3db1SKonstantin Belousov dst_object->charge = dst_entry->end - dst_entry->start; 2130df8bae1dSRodney W. Grimes 2131df8bae1dSRodney W. Grimes dst_entry->object.vm_object = dst_object; 2132df8bae1dSRodney W. Grimes dst_entry->offset = 0; 213378022527SKonstantin Belousov dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC; 21340973283dSKonstantin Belousov } 2135b57be759SMark Johnston 2136b57be759SMark Johnston VM_OBJECT_WLOCK(dst_object); 2137210a6886SKonstantin Belousov if (fork_charge != NULL) { 2138ef694c1aSEdward Tomasz Napierala KASSERT(dst_entry->cred == NULL, 2139121fd461SKonstantin Belousov ("vm_fault_copy_entry: leaked swp charge")); 2140ef694c1aSEdward Tomasz Napierala dst_object->cred = curthread->td_ucred; 2141ef694c1aSEdward Tomasz Napierala crhold(dst_object->cred); 2142121fd461SKonstantin Belousov *fork_charge += dst_object->charge; 21430cb2610eSMark Johnston } else if ((dst_object->flags & OBJ_SWAP) != 0 && 21449f25ab83SKonstantin Belousov dst_object->cred == NULL) { 21450973283dSKonstantin Belousov KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", 21460973283dSKonstantin Belousov dst_entry)); 2147ef694c1aSEdward Tomasz Napierala dst_object->cred = dst_entry->cred; 2148ef694c1aSEdward Tomasz Napierala dst_entry->cred = NULL; 2149210a6886SKonstantin Belousov } 21500973283dSKonstantin Belousov 2151210a6886SKonstantin Belousov /* 2152ef45823eSKonstantin Belousov * Loop through all of the virtual pages within the entry's 2153ef45823eSKonstantin Belousov * range, copying each page from the source object to the 2154ef45823eSKonstantin Belousov * destination object. Since the source is wired, those pages 2155ef45823eSKonstantin Belousov * must exist. In contrast, the destination is pageable. 21566939b4d3SMark Johnston * Since the destination object doesn't share any backing storage 2157ef45823eSKonstantin Belousov * with the source object, all of its pages must be dirtied, 2158ef45823eSKonstantin Belousov * regardless of whether they can be written. 2159df8bae1dSRodney W. Grimes */ 21607afab86cSAlan Cox for (vaddr = dst_entry->start, dst_pindex = 0; 2161df8bae1dSRodney W. Grimes vaddr < dst_entry->end; 21627afab86cSAlan Cox vaddr += PAGE_SIZE, dst_pindex++) { 21630973283dSKonstantin Belousov again: 2164df8bae1dSRodney W. Grimes /* 2165df8bae1dSRodney W. Grimes * Find the page in the source object, and copy it in. 
		 * Because the source is wired down, the page will be
		 * in memory.
		 */
		if (src_object != dst_object)
			VM_OBJECT_RLOCK(src_object);
		object = src_object;
		pindex = src_pindex + dst_pindex;
		while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
		    (backing_object = object->backing_object) != NULL) {
			/*
			 * Unless the source mapping is read-only or
			 * it is presently being upgraded from
			 * read-only, the first object in the shadow
			 * chain should provide all of the pages.  In
			 * other words, this loop body should never be
			 * executed when the source mapping is already
			 * read/write.
			 */
			KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
			    upgrade,
			    ("vm_fault_copy_entry: main object missing page"));

			VM_OBJECT_RLOCK(backing_object);
			pindex += OFF_TO_IDX(object->backing_object_offset);
			if (object != dst_object)
				VM_OBJECT_RUNLOCK(object);
			object = backing_object;
		}
		KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));

		if (object != dst_object) {
			/*
			 * Allocate a page in the destination object.
			 */
			dst_m = vm_page_alloc(dst_object, (src_object ==
			    dst_object ? src_pindex : 0) + dst_pindex,
			    VM_ALLOC_NORMAL);
			if (dst_m == NULL) {
				VM_OBJECT_WUNLOCK(dst_object);
				VM_OBJECT_RUNLOCK(object);
				vm_wait(dst_object);
				VM_OBJECT_WLOCK(dst_object);
				goto again;
			}

			/*
			 * See the comment in vm_fault_cow().
			 */
			if (src_object == dst_object &&
			    (object->flags & OBJ_ONEMAPPING) == 0)
				pmap_remove_all(src_m);
			pmap_copy_page(src_m, dst_m);

			/*
			 * The object lock does not guarantee that "src_m" will
			 * transition from invalid to valid, but it does ensure
			 * that "src_m" will not transition from valid to
			 * invalid.
			 */
			dst_m->dirty = dst_m->valid = src_m->valid;
			VM_OBJECT_RUNLOCK(object);
		} else {
			dst_m = src_m;
			if (vm_page_busy_acquire(dst_m, VM_ALLOC_WAITFAIL) == 0)
				goto again;
			if (dst_m->pindex >= dst_object->size) {
				/*
				 * We are upgrading.  The index can fall
				 * out of bounds if the object type is
				 * vnode and the file was truncated.
				 */
				vm_page_xunbusy(dst_m);
				break;
			}
		}

		/*
		 * Enter it in the pmap.  If a wired, copy-on-write
		 * mapping is being replaced by a write-enabled
		 * mapping, then wire that new mapping.
		 *
		 * The page can be invalid if the user called
		 * msync(MS_INVALIDATE) or truncated the backing vnode
		 * or shared memory object.  In this case, do not
		 * insert it into pmap, but still do the copy so that
		 * all copies of the wired map entry have similar
		 * backing pages.
		 */
		if (vm_page_all_valid(dst_m)) {
			VM_OBJECT_WUNLOCK(dst_object);
			pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
			    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
			VM_OBJECT_WLOCK(dst_object);
		}

		/*
		 * Mark it no longer busy, and put it on the active list.
		 */
		if (upgrade) {
			if (src_m != dst_m) {
				vm_page_unwire(src_m, PQ_INACTIVE);
				vm_page_wire(dst_m);
			} else {
				KASSERT(vm_page_wired(dst_m),
				    ("dst_m %p is not wired", dst_m));
			}
		} else {
			vm_page_activate(dst_m);
		}
		vm_page_xunbusy(dst_m);
	}
	VM_OBJECT_WUNLOCK(dst_object);
	if (upgrade) {
		dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
		vm_object_deallocate(src_object);
	}
}
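
/*
 * Illustrative call sketch, an assumption about typical usage rather than
 * code copied from this tree; the map, entry, and charge names below are
 * hypothetical.
 *
 * Forking a wired entry into a child map, with fork_charge accumulating
 * the swap charge moved to the new object:
 *
 *	vm_fault_copy_entry(new_map, old_map, new_entry, old_entry,
 *	    &fork_charge);
 *
 * In-place COW upgrade of a wired, copy-on-write entry (src_entry ==
 * dst_entry), as when write access is being enabled on it:
 *
 *	vm_fault_copy_entry(map, map, entry, entry, NULL);
 */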

/*
 * Block entry into the machine-independent layer's page fault handler by
 * the calling thread.  Subsequent calls to vm_fault() by that thread will
 * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of
 * spurious page faults.
 */
int
vm_fault_disable_pagefaults(void)
{

	return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
}

void
vm_fault_enable_pagefaults(int save)
{

	curthread_pflags_restore(save);
}
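
/*
 * Illustrative sketch (hypothetical caller, not code from this file): the
 * save/restore pair above brackets a region in which the thread must not
 * enter vm_fault(), so that a user access such as copyin() fails with
 * EFAULT instead of faulting pages in:
 *
 *	int error, save;
 *
 *	save = vm_fault_disable_pagefaults();
 *	error = copyin(uaddr, kbuf, len);
 *	vm_fault_enable_pagefaults(save);
 *	if (error != 0)
 *		... handle the error, e.g. retry with the pages held ...
 */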