/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005 Yahoo! Technologies Norway AS
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	The proverbial page-out daemon.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static void vm_pageout_init(void);
static int vm_pageout_clean(vm_page_t m, int *numpagedout);
static int vm_pageout_cluster(vm_page_t m);
static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
    int starting_page_shortage);

SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
    NULL);

struct proc *pageproc;

static struct kproc_desc page_kp = {
	"pagedaemon",
	vm_pageout,
	&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
    &page_kp);

SDT_PROVIDER_DEFINE(vm);
SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon(void);
static struct proc *vmproc;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

/* Pagedaemon activity rates, in subdivisions of one second. */
#define	VM_LAUNDER_RATE		10
#define	VM_INACT_SCAN_RATE	2

int vm_pageout_deficit;		/* Estimated number of pages deficit */
u_int vm_pageout_wakeup_thresh;
static int vm_pageout_oom_seq = 12;
bool vm_pageout_wanted;		/* Event on which pageout daemon sleeps */
bool vm_pages_needed;		/* Are threads waiting for free pages? */

/* Pending request for dirty page laundering. */
static enum {
	VM_LAUNDRY_IDLE,
	VM_LAUNDRY_BACKGROUND,
	VM_LAUNDRY_SHORTFALL
} vm_laundry_request = VM_LAUNDRY_IDLE;

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
#endif
static int vm_pageout_update_period;
static int disable_swap_pageouts;
static int lowmem_period = 10;
static time_t lowmem_uptime;

#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
static int vm_swap_idle_enabled = 0;
#else
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
#endif

static int vm_panic_on_oom = 0;

SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
    CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
    "panic on out of memory instead of killing the largest process");

SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
    CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
    "free page threshold for waking up the pageout daemon");

SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
    CTLFLAG_RW, &vm_pageout_update_period, 0,
    "Maximum active LRU update period");

SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
    "Low memory callback period");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
    CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
    CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
    CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
    CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
    CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
    CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
    CTLFLAG_RW, &vm_pageout_oom_seq, 0,
    "back-to-back calls to oom detector to start OOM");

static int act_scan_laundry_weight = 3;
SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW,
    &act_scan_laundry_weight, 0,
    "weight given to clean vs. dirty pages in active queue scans");

static u_int vm_background_launder_target;
SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW,
    &vm_background_launder_target, 0,
    "background laundering target, in pages");

static u_int vm_background_launder_rate = 4096;
SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW,
    &vm_background_launder_rate, 0,
    "background laundering rate, in kilobytes per second");

static u_int vm_background_launder_max = 20 * 1024;
SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW,
    &vm_background_launder_max, 0, "background laundering cap, in kilobytes");

#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
SYSCTL_INT(_vm, OID_AUTO, max_wired,
    CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");

static u_int isqrt(u_int num);
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static int vm_pageout_launder(struct vm_domain *vmd, int launder,
    bool in_shortfall);
static void vm_pageout_laundry_worker(void *arg);
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_req_vmdaemon(int req);
#endif
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);

/*
 * Initialize a dummy page for marking the caller's place in the specified
 * paging queue.  In principle, this function only needs to set the flag
 * PG_MARKER.  Nonetheless, it write busies and initializes the hold count
 * to one as safety precautions.
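 *
 * Queue scans recognize marker pages by testing PG_MARKER and skip over
 * them, so a marker can safely record a position in a page queue across
 * queue unlocks.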
 */
static void
vm_pageout_init_marker(vm_page_t marker, u_short queue)
{

	bzero(marker, sizeof(*marker));
	marker->flags = PG_MARKER;
	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
	marker->queue = queue;
	marker->hold_count = 1;
}

/*
 * vm_pageout_fallback_object_lock:
 *
 * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
 * known to have failed and page queue must be either PQ_ACTIVE or
 * PQ_INACTIVE.  To avoid lock order violation, unlock the page queue
 * while locking the vm object.  Use marker page to detect page queue
 * changes and maintain notion of next page on page queue.  Return
 * TRUE if no changes were detected, FALSE otherwise.  vm object is
 * locked on return.
 *
 * This function depends on both the lock portion of struct vm_object
 * and normal struct vm_page being type stable.
 */
static boolean_t
vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	struct vm_pagequeue *pq;
	boolean_t unchanged;
	u_short queue;
	vm_object_t object;

	queue = m->queue;
	vm_pageout_init_marker(&marker, queue);
	pq = vm_page_pagequeue(m);
	object = m->object;

	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
	vm_pagequeue_unlock(pq);
	vm_page_unlock(m);
	VM_OBJECT_WLOCK(object);
	vm_page_lock(m);
	vm_pagequeue_lock(pq);

	/*
	 * The page's object might have changed, and/or the page might
	 * have moved from its original position in the queue.  If the
	 * page's object has changed, then the caller should abandon
	 * processing the page because the wrong object lock was
	 * acquired.  Use the marker's plinks.q, not the page's, to
	 * determine if the page has been moved.  The state of the
	 * page's plinks.q can be indeterminate; whereas, the marker's
	 * plinks.q must be valid.
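	 *
	 * The check below therefore asks whether the page still
	 * immediately precedes the marker in the queue and whether its
	 * object pointer is unchanged.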
	 */
	*next = TAILQ_NEXT(&marker, plinks.q);
	unchanged = m->object == object &&
	    m == TAILQ_PREV(&marker, pglist, plinks.q);
	KASSERT(!unchanged || m->queue == queue,
	    ("page %p queue %d %d", m, queue, m->queue));
	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
	return (unchanged);
}

/*
 * Lock the page while holding the page queue lock.  Use marker page
 * to detect page queue changes and maintain notion of next page on
 * page queue.  Return TRUE if no changes were detected, FALSE
 * otherwise.  The page is locked on return.  The page queue lock might
 * be dropped and reacquired.
 *
 * This function depends on normal struct vm_page being type stable.
 */
static boolean_t
vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	struct vm_pagequeue *pq;
	boolean_t unchanged;
	u_short queue;

	vm_page_lock_assert(m, MA_NOTOWNED);
	if (vm_page_trylock(m))
		return (TRUE);

	queue = m->queue;
	vm_pageout_init_marker(&marker, queue);
	pq = vm_page_pagequeue(m);

	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
	vm_pagequeue_unlock(pq);
	vm_page_lock(m);
	vm_pagequeue_lock(pq);

	/* Page queue might have changed. */
	*next = TAILQ_NEXT(&marker, plinks.q);
	unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
	KASSERT(!unchanged || m->queue == queue,
	    ("page %p queue %d %d", m, queue, m->queue));
	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
	return (unchanged);
}

/*
 * Scan for pages at adjacent offsets within the given page's object that are
 * eligible for laundering, form a cluster of these pages and the given page,
 * and launder that cluster.
 */
static int
vm_pageout_cluster(vm_page_t m)
{
	vm_object_t object;
	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
	vm_pindex_t pindex;
	int ib, is, page_base, pageout_count;
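
	/*
	 * mc[] is sized for clustering in both directions: the given page is
	 * placed at slot vm_pageout_page_count, the reverse scan fills the
	 * slots below it by decrementing page_base, and the forward scan
	 * fills the slots above it.
	 */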

	vm_page_assert_locked(m);
	object = m->object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	pindex = m->pindex;

	/*
	 * We can't clean the page if it is busy or held.
	 */
	vm_page_assert_unbusied(m);
	KASSERT(m->hold_count == 0, ("page %p is held", m));
	vm_page_unlock(m);

	mc[vm_pageout_page_count] = pb = ps = m;
	pageout_count = 1;
	page_base = vm_pageout_page_count;
	ib = 1;
	is = 1;

	/*
	 * We can cluster only if the page is not clean, busy, or held, and
	 * the page is in the laundry queue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying to
	 * align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
more:
	while (ib != 0 && pageout_count < vm_pageout_page_count) {
		if (ib > pindex) {
			ib = 0;
			break;
		}
		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
			ib = 0;
			break;
		}
		vm_page_test_dirty(p);
		if (p->dirty == 0) {
			ib = 0;
			break;
		}
		vm_page_lock(p);
		if (!vm_page_in_laundry(p) ||
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_unlock(p);
			ib = 0;
			break;
		}
		vm_page_unlock(p);
		mc[--page_base] = pb = p;
		++pageout_count;
		++ib;

		/*
		 * We are at an alignment boundary.  Stop here, and switch
		 * directions.  Do not clear ib.
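		 *
		 * The test below checks whether the lowest pindex in the
		 * cluster, pindex - (ib - 1), has reached a multiple of
		 * vm_pageout_page_count, i.e., whether the cluster's start
		 * is now aligned to the cluster size.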
		 */
		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
			break;
	}
	while (pageout_count < vm_pageout_page_count &&
	    pindex + is < object->size) {
		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
			break;
		vm_page_test_dirty(p);
		if (p->dirty == 0)
			break;
		vm_page_lock(p);
		if (!vm_page_in_laundry(p) ||
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_unlock(p);
			break;
		}
		vm_page_unlock(p);
		mc[page_base + pageout_count] = ps = p;
		++pageout_count;
		++is;
	}

	/*
	 * If we exhausted our forward scan, continue with the reverse scan
	 * when possible, even past an alignment boundary.  This catches
	 * boundary conditions.
	 */
	if (ib != 0 && pageout_count < vm_pageout_page_count)
		goto more;

	return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
	    NULL));
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.  Note that we set up for the start of
 *	I/O (i.e. busy the page), mark it read-only, and bump the object
 *	reference count all in here rather than in the parent.  If we want
 *	the parent to do more sophisticated things we may have to change
 *	the ordering.
 *
 *	Returned runlen is the count of pages between mreq and first
 *	page after mreq with status VM_PAGER_AGAIN.
 *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
 *	for any page in runlen set.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
    boolean_t *eio)
{
	vm_object_t object = mc[0]->object;
	int pageout_status[count];
	int numpagedout = 0;
	int i, runlen;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter and
	 * mark the pages read-only.
	 *
	 * We do not have to fixup the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
			mc[i], i, count));
		vm_page_sbusy(mc[i]);
		pmap_remove_write(mc[i]);
	}
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count, flags, pageout_status);

	runlen = count - mreq;
	if (eio != NULL)
		*eio = FALSE;
	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
		    !pmap_page_is_write_mapped(mt),
		    ("vm_pageout_flush: page %p is not write protected", mt));
		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			vm_page_lock(mt);
			if (vm_page_in_laundry(mt))
				vm_page_deactivate_noreuse(mt);
			vm_page_unlock(mt);
			/* FALLTHROUGH */
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * The page is outside the object's range.  We pretend
			 * that the page out worked and clean the page, so the
			 * changes will be lost if the page is reclaimed by
			 * the page daemon.
			 */
			vm_page_undirty(mt);
			vm_page_lock(mt);
			if (vm_page_in_laundry(mt))
				vm_page_deactivate_noreuse(mt);
			vm_page_unlock(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * If the page couldn't be paged out, then reactivate
			 * it so that it doesn't clog the laundry and inactive
			 * queues.  (We will try paging it out again later).
			 */
			vm_page_lock(mt);
			vm_page_activate(mt);
			vm_page_unlock(mt);
			if (eio != NULL && i >= mreq && i - mreq < runlen)
				*eio = TRUE;
			break;
		case VM_PAGER_AGAIN:
			if (i >= mreq && i - mreq < runlen)
				runlen = i - mreq;
			break;
		}

		/*
		 * If the operation is still going, leave the page busy to
		 * block all other accesses. Also, leave the paging in
		 * progress indicator set so that we don't attempt an object
		 * collapse.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_object_pip_wakeup(object);
			vm_page_sunbusy(mt);
		}
	}
	if (prunlen != NULL)
		*prunlen = runlen;
	return (numpagedout);
}

#if !defined(NO_SWAPPING)
/*
 *	vm_pageout_object_deactivate_pages
 *
 *	Deactivate enough pages to satisfy the inactive target
 *	requirements.
 *
 *	The object and map must be locked.
 */
static void
vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
    long desired)
{
	vm_object_t backing_object, object;
	vm_page_t p;
	int act_delta, remove_mode;

	VM_OBJECT_ASSERT_LOCKED(first_object);
	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
		return;
	for (object = first_object;; object = backing_object) {
		if (pmap_resident_count(pmap) <= desired)
			goto unlock_return;
		VM_OBJECT_ASSERT_LOCKED(object);
		if ((object->flags & OBJ_UNMANAGED) != 0 ||
		    object->paging_in_progress != 0)
			goto unlock_return;

		remove_mode = 0;
		if (object->shadow_count > 1)
			remove_mode = 1;
		/*
		 * Scan the object's entire memory queue.
		 */
		TAILQ_FOREACH(p, &object->memq, listq) {
			if (pmap_resident_count(pmap) <= desired)
				goto unlock_return;
			if (vm_page_busied(p))
				continue;
			PCPU_INC(cnt.v_pdpages);
			vm_page_lock(p);
			if (p->wire_count != 0 || p->hold_count != 0 ||
			    !pmap_page_exists_quick(pmap, p)) {
				vm_page_unlock(p);
				continue;
			}
			act_delta = pmap_ts_referenced(p);
			if ((p->aflags & PGA_REFERENCED) != 0) {
				if (act_delta == 0)
					act_delta = 1;
				vm_page_aflag_clear(p, PGA_REFERENCED);
			}
			if (!vm_page_active(p) && act_delta != 0) {
				vm_page_activate(p);
				p->act_count += act_delta;
			} else if (vm_page_active(p)) {
				if (act_delta == 0) {
					p->act_count -= min(p->act_count,
					    ACT_DECLINE);
					if (!remove_mode && p->act_count == 0) {
						pmap_remove_all(p);
						vm_page_deactivate(p);
					} else
						vm_page_requeue(p);
				} else {
					vm_page_activate(p);
					if (p->act_count < ACT_MAX -
					    ACT_ADVANCE)
						p->act_count += ACT_ADVANCE;
					vm_page_requeue(p);
				}
			} else if (vm_page_inactive(p))
				pmap_remove_all(p);
			vm_page_unlock(p);
		}
		if ((backing_object = object->backing_object) == NULL)
			goto unlock_return;
		VM_OBJECT_RLOCK(backing_object);
		if (object != first_object)
			VM_OBJECT_RUNLOCK(object);
	}
unlock_return:
	if (object != first_object)
		VM_OBJECT_RUNLOCK(object);
}

/*
 * deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_pageout_map_deactivate_pages(map, desired)
	vm_map_t map;
	long desired;
{
	vm_map_entry_t tmpe;
	vm_object_t obj, bigobj;
	int nothingwired;

	if (!vm_map_trylock(map))
		return;

	bigobj = NULL;
	nothingwired = TRUE;

	/*
	 * first, search out the biggest object, and try to free pages from
	 * that.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
				if (obj->shadow_count <= 1 &&
				    (bigobj == NULL ||
				     bigobj->resident_page_count <
				     obj->resident_page_count)) {
					if (bigobj != NULL)
						VM_OBJECT_RUNLOCK(bigobj);
					bigobj = obj;
				} else
					VM_OBJECT_RUNLOCK(obj);
			}
		}
		if (tmpe->wired_count > 0)
			nothingwired = FALSE;
		tmpe = tmpe->next;
	}

	if (bigobj != NULL) {
		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
		VM_OBJECT_RUNLOCK(bigobj);
	}
	/*
	 * Next, hunt around for other pages to deactivate.  We actually
	 * do this search sort of wrong -- .text first is not the best idea.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			break;
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL) {
				VM_OBJECT_RLOCK(obj);
				vm_pageout_object_deactivate_pages(map->pmap,
				    obj, desired);
				VM_OBJECT_RUNLOCK(obj);
			}
		}
		tmpe = tmpe->next;
	}

	/*
	 * Remove all mappings if a process is swapped out, this will free page
	 * table pages.
	 */
	if (desired == 0 && nothingwired) {
		pmap_remove(vm_map_pmap(map), vm_map_min(map),
		    vm_map_max(map));
	}

	vm_map_unlock(map);
}
#endif		/* !defined(NO_SWAPPING) */

/*
 * Attempt to acquire all of the necessary locks to launder a page and
 * then call through the clustering layer to PUTPAGES.  Wait a short
 * time for a vnode lock.
 *
 * Requires the page and object lock on entry, releases both before return.
 * Returns 0 on success and an errno otherwise.
 */
static int
vm_pageout_clean(vm_page_t m, int *numpagedout)
{
	struct vnode *vp;
	struct mount *mp;
	vm_object_t object;
	vm_pindex_t pindex;
	int error, lockmode;

	vm_page_assert_locked(m);
	object = m->object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	error = 0;
	vp = NULL;
	mp = NULL;

	/*
	 * The object is already known NOT to be dead.   It
	 * is possible for the vget() to block the whole
	 * pageout daemon, but the new low-memory handling
	 * code should prevent it.
	 *
	 * We can't wait forever for the vnode lock, we might
	 * deadlock due to a vn_read() getting stuck in
	 * vm_wait while holding this vnode.  We skip the
	 * vnode if we can't get it in a reasonable amount
	 * of time.
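	 *
	 * A failed or timed-out lock attempt is reported to the caller as
	 * EDEADLK, which vm_pageout_launder() counts as a pageout lock miss
	 * and as a skipped vnode.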
	 */
	if (object->type == OBJT_VNODE) {
		vm_page_unlock(m);
		vp = object->handle;
		if (vp->v_type == VREG &&
		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			mp = NULL;
			error = EDEADLK;
			goto unlock_all;
		}
		KASSERT(mp != NULL,
		    ("vp %p with NULL v_mount", vp));
		vm_object_reference_locked(object);
		pindex = m->pindex;
		VM_OBJECT_WUNLOCK(object);
		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
		    LK_SHARED : LK_EXCLUSIVE;
		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
			vp = NULL;
			error = EDEADLK;
			goto unlock_mp;
		}
		VM_OBJECT_WLOCK(object);
		vm_page_lock(m);
		/*
		 * While the object and page were unlocked, the page
		 * may have been:
		 * (1) moved to a different queue,
		 * (2) reallocated to a different object,
		 * (3) reallocated to a different offset, or
		 * (4) cleaned.
		 */
		if (!vm_page_in_laundry(m) || m->object != object ||
		    m->pindex != pindex || m->dirty == 0) {
			vm_page_unlock(m);
			error = ENXIO;
			goto unlock_all;
		}

		/*
		 * The page may have been busied or held while the object
		 * and page locks were released.
		 */
		if (vm_page_busied(m) || m->hold_count != 0) {
			vm_page_unlock(m);
			error = EBUSY;
			goto unlock_all;
		}
	}

	/*
	 * If a page is dirty, then it is either being washed
	 * (but not yet cleaned) or it is still in the
	 * laundry.  If it is still in the laundry, then we
	 * start the cleaning operation.
	 */
	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
		error = EIO;

unlock_all:
	VM_OBJECT_WUNLOCK(object);

unlock_mp:
	vm_page_lock_assert(m, MA_NOTOWNED);
	if (mp != NULL) {
		if (vp != NULL)
			vput(vp);
		vm_object_deallocate(object);
		vn_finished_write(mp);
	}

	return (error);
}

/*
 * Attempt to launder the specified number of pages.
 *
 * Returns the number of pages successfully laundered.
 */
static int
vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
{
	struct vm_pagequeue *pq;
	vm_object_t object;
	vm_page_t m, next;
	int act_delta, error, maxscan, numpagedout, starting_target;
	int vnodes_skipped;
	bool pageout_ok, queue_locked;

	starting_target = launder;
	vnodes_skipped = 0;

	/*
	 * Scan the laundry queue for pages eligible to be laundered.  We stop
	 * once the target number of dirty pages have been laundered, or once
	 * we've reached the end of the queue.  A single iteration of this loop
	 * may cause more than one page to be laundered because of clustering.
	 *
	 * maxscan ensures that we don't re-examine requeued pages.  Any
	 * additional pages written as part of a cluster are subtracted from
	 * maxscan since they must be taken from the laundry queue.
	 */
	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
	maxscan = pq->pq_cnt;

	vm_pagequeue_lock(pq);
	queue_locked = true;
	for (m = TAILQ_FIRST(&pq->pq_pl);
	    m != NULL && maxscan-- > 0 && launder > 0;
	    m = next) {
		vm_pagequeue_assert_locked(pq);
		KASSERT(queue_locked, ("unlocked laundry queue"));
		KASSERT(vm_page_in_laundry(m),
		    ("page %p has an inconsistent queue", m));
		next = TAILQ_NEXT(m, plinks.q);
		if ((m->flags & PG_MARKER) != 0)
			continue;
		KASSERT((m->flags & PG_FICTITIOUS) == 0,
		    ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
			vm_page_unlock(m);
			continue;
		}
		object = m->object;
		if ((!VM_OBJECT_TRYWLOCK(object) &&
		    (!vm_pageout_fallback_object_lock(m, &next) ||
		    m->hold_count != 0)) || vm_page_busied(m)) {
			VM_OBJECT_WUNLOCK(object);
			vm_page_unlock(m);
			continue;
		}

		/*
		 * Unlock the laundry queue, invalidating the 'next' pointer.
		 * Use a marker to remember our place in the laundry queue.
		 */
		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
		    plinks.q);
		vm_pagequeue_unlock(pq);
		queue_locked = false;

		/*
		 * Invalid pages can be easily freed.  They cannot be
		 * mapped; vm_page_free() asserts this.
		 */
		if (m->valid == 0)
			goto free_page;

		/*
		 * If the page has been referenced and the object is not dead,
		 * reactivate or requeue the page depending on whether the
		 * object is mapped.
		 */
		if ((m->aflags & PGA_REFERENCED) != 0) {
			vm_page_aflag_clear(m, PGA_REFERENCED);
			act_delta = 1;
		} else
			act_delta = 0;
		if (object->ref_count != 0)
			act_delta += pmap_ts_referenced(m);
		else {
			KASSERT(!pmap_page_is_mapped(m),
			    ("page %p is mapped", m));
		}
		if (act_delta != 0) {
			if (object->ref_count != 0) {
				PCPU_INC(cnt.v_reactivated);
				vm_page_activate(m);

				/*
				 * Increase the activation count if the page
				 * was referenced while in the laundry queue.
				 * This makes it less likely that the page will
				 * be returned prematurely to the inactive
				 * queue.
				 */
				m->act_count += act_delta + ACT_ADVANCE;

				/*
				 * If this was a background laundering, count
				 * activated pages towards our target.  The
				 * purpose of background laundering is to
				 * ensure that pages are eventually cycled
				 * through the laundry queue, and an activation
				 * is a valid way out.
				 */
				if (!in_shortfall)
					launder--;
				goto drop_page;
			} else if ((object->flags & OBJ_DEAD) == 0)
				goto requeue_page;
		}

		/*
		 * If the page appears to be clean at the machine-independent
		 * layer, then remove all of its mappings from the pmap in
		 * anticipation of freeing it.  If, however, any of the page's
		 * mappings allow write access, then the page may still be
		 * modified until the last of those mappings are removed.
		 */
		if (object->ref_count != 0) {
			vm_page_test_dirty(m);
			if (m->dirty == 0)
				pmap_remove_all(m);
		}

		/*
		 * Clean pages are freed, and dirty pages are paged out unless
		 * they belong to a dead object.  Requeueing dirty pages from
		 * dead objects is pointless, as they are being paged out and
		 * freed by the thread that destroyed the object.
		 */
		if (m->dirty == 0) {
free_page:
			vm_page_free(m);
			PCPU_INC(cnt.v_dfree);
		} else if ((object->flags & OBJ_DEAD) == 0) {
			if (object->type != OBJT_SWAP &&
			    object->type != OBJT_DEFAULT)
				pageout_ok = true;
			else if (disable_swap_pageouts)
				pageout_ok = false;
			else
				pageout_ok = true;
			if (!pageout_ok) {
requeue_page:
				vm_pagequeue_lock(pq);
				queue_locked = true;
				vm_page_requeue_locked(m);
				goto drop_page;
			}

			/*
			 * Form a cluster with adjacent, dirty pages from the
			 * same object, and page out that entire cluster.
			 *
			 * The adjacent, dirty pages must also be in the
			 * laundry.  However, their mappings are not checked
			 * for new references.  Consequently, a recently
			 * referenced page may be paged out.  However, that
			 * page will not be prematurely reclaimed.  After page
			 * out, the page will be placed in the inactive queue,
			 * where any new references will be detected and the
			 * page reactivated.
			 */
			error = vm_pageout_clean(m, &numpagedout);
			if (error == 0) {
				launder -= numpagedout;
				maxscan -= numpagedout - 1;
			} else if (error == EDEADLK) {
				pageout_lock_miss++;
				vnodes_skipped++;
			}
			goto relock_queue;
		}
drop_page:
		vm_page_unlock(m);
		VM_OBJECT_WUNLOCK(object);
relock_queue:
		if (!queue_locked) {
			vm_pagequeue_lock(pq);
			queue_locked = true;
		}
		next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
	}
	vm_pagequeue_unlock(pq);

	/*
	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
	 * and we didn't launder enough pages.
	 */
	if (vnodes_skipped > 0 && launder > 0)
		(void)speedup_syncer();

	return (starting_target - launder);
}

/*
 * Compute the integer square root.
 */
static u_int
isqrt(u_int num)
{
	u_int bit, root, tmp;

	bit = 1u << ((NBBY * sizeof(u_int)) - 2);
	while (bit > num)
		bit >>= 2;
	root = 0;
	while (bit != 0) {
		tmp = root + bit;
		root >>= 1;
		if (num >= tmp) {
			num -= tmp;
			root += bit;
		}
		bit >>= 2;
	}
	return (root);
}

/*
 * Perform the work of the laundry thread: periodically wake up and determine
 * whether any pages need to be laundered.  If so, determine the number of
 * pages that need to be laundered, and launder them.
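 *
 * There are two sources of laundering work: a shortfall request posted by
 * the page daemon, which is paced so that the requested target is met within
 * one inactive queue scan period, and background laundering, which kicks in
 * when dirty (laundry) pages accumulate relative to clean inactive and free
 * pages.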
1111*ebcddc72SAlan Cox */ 1112*ebcddc72SAlan Cox static void 1113*ebcddc72SAlan Cox vm_pageout_laundry_worker(void *arg) 1114*ebcddc72SAlan Cox { 1115*ebcddc72SAlan Cox struct vm_domain *domain; 1116*ebcddc72SAlan Cox struct vm_pagequeue *pq; 1117*ebcddc72SAlan Cox uint64_t nclean, ndirty; 1118*ebcddc72SAlan Cox u_int last_launder, wakeups; 1119*ebcddc72SAlan Cox int domidx, last_target, launder, shortfall, shortfall_cycle, target; 1120*ebcddc72SAlan Cox bool in_shortfall; 1121*ebcddc72SAlan Cox 1122*ebcddc72SAlan Cox domidx = (uintptr_t)arg; 1123*ebcddc72SAlan Cox domain = &vm_dom[domidx]; 1124*ebcddc72SAlan Cox pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; 1125*ebcddc72SAlan Cox KASSERT(domain->vmd_segs != 0, ("domain without segments")); 1126*ebcddc72SAlan Cox vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); 1127*ebcddc72SAlan Cox 1128*ebcddc72SAlan Cox shortfall = 0; 1129*ebcddc72SAlan Cox in_shortfall = false; 1130*ebcddc72SAlan Cox shortfall_cycle = 0; 1131*ebcddc72SAlan Cox target = 0; 1132*ebcddc72SAlan Cox last_launder = 0; 1133*ebcddc72SAlan Cox 1134*ebcddc72SAlan Cox /* 1135*ebcddc72SAlan Cox * The pageout laundry worker is never done, so loop forever. 1136*ebcddc72SAlan Cox */ 1137*ebcddc72SAlan Cox for (;;) { 1138*ebcddc72SAlan Cox KASSERT(target >= 0, ("negative target %d", target)); 1139*ebcddc72SAlan Cox KASSERT(shortfall_cycle >= 0, 1140*ebcddc72SAlan Cox ("negative cycle %d", shortfall_cycle)); 1141*ebcddc72SAlan Cox launder = 0; 1142*ebcddc72SAlan Cox wakeups = VM_METER_PCPU_CNT(v_pdwakeups); 1143*ebcddc72SAlan Cox 1144*ebcddc72SAlan Cox /* 1145*ebcddc72SAlan Cox * First determine whether we need to launder pages to meet a 1146*ebcddc72SAlan Cox * shortage of free pages. 1147*ebcddc72SAlan Cox */ 1148*ebcddc72SAlan Cox if (shortfall > 0) { 1149*ebcddc72SAlan Cox in_shortfall = true; 1150*ebcddc72SAlan Cox shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; 1151*ebcddc72SAlan Cox target = shortfall; 1152*ebcddc72SAlan Cox } else if (!in_shortfall) 1153*ebcddc72SAlan Cox goto trybackground; 1154*ebcddc72SAlan Cox else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { 1155*ebcddc72SAlan Cox /* 1156*ebcddc72SAlan Cox * We recently entered shortfall and began laundering 1157*ebcddc72SAlan Cox * pages. If we have completed that laundering run 1158*ebcddc72SAlan Cox * (and we are no longer in shortfall) or we have met 1159*ebcddc72SAlan Cox * our laundry target through other activity, then we 1160*ebcddc72SAlan Cox * can stop laundering pages. 1161*ebcddc72SAlan Cox */ 1162*ebcddc72SAlan Cox in_shortfall = false; 1163*ebcddc72SAlan Cox target = 0; 1164*ebcddc72SAlan Cox goto trybackground; 1165*ebcddc72SAlan Cox } 1166*ebcddc72SAlan Cox last_launder = wakeups; 1167*ebcddc72SAlan Cox launder = target / shortfall_cycle--; 1168*ebcddc72SAlan Cox goto dolaundry; 1169*ebcddc72SAlan Cox 1170*ebcddc72SAlan Cox /* 1171*ebcddc72SAlan Cox * There's no immediate need to launder any pages; see if we 1172*ebcddc72SAlan Cox * meet the conditions to perform background laundering: 1173*ebcddc72SAlan Cox * 1174*ebcddc72SAlan Cox * 1. The ratio of dirty to clean inactive pages exceeds the 1175*ebcddc72SAlan Cox * background laundering threshold and the pagedaemon has 1176*ebcddc72SAlan Cox * been woken up to reclaim pages since our last 1177*ebcddc72SAlan Cox * laundering, or 1178*ebcddc72SAlan Cox * 2. we haven't yet reached the target of the current 1179*ebcddc72SAlan Cox * background laundering run. 
1180*ebcddc72SAlan Cox * 1181*ebcddc72SAlan Cox * The background laundering threshold is not a constant. 1182*ebcddc72SAlan Cox * Instead, it is a slowly growing function of the number of 1183*ebcddc72SAlan Cox * page daemon wakeups since the last laundering. Thus, as the 1184*ebcddc72SAlan Cox * ratio of dirty to clean inactive pages grows, the amount of 1185*ebcddc72SAlan Cox * memory pressure required to trigger laundering decreases. 1186*ebcddc72SAlan Cox */ 1187*ebcddc72SAlan Cox trybackground: 1188*ebcddc72SAlan Cox nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; 1189*ebcddc72SAlan Cox ndirty = vm_cnt.v_laundry_count; 1190*ebcddc72SAlan Cox if (target == 0 && wakeups != last_launder && 1191*ebcddc72SAlan Cox ndirty * isqrt(wakeups - last_launder) >= nclean) { 1192*ebcddc72SAlan Cox target = vm_background_launder_target; 1193*ebcddc72SAlan Cox } 1194*ebcddc72SAlan Cox 1195*ebcddc72SAlan Cox /* 1196*ebcddc72SAlan Cox * We have a non-zero background laundering target. If we've 1197*ebcddc72SAlan Cox * laundered up to our maximum without observing a page daemon 1198*ebcddc72SAlan Cox * wakeup, just stop. This is a safety belt that ensures we 1199*ebcddc72SAlan Cox * don't launder an excessive amount if memory pressure is low 1200*ebcddc72SAlan Cox * and the ratio of dirty to clean pages is large. Otherwise, 1201*ebcddc72SAlan Cox * proceed at the background laundering rate. 1202*ebcddc72SAlan Cox */ 1203*ebcddc72SAlan Cox if (target > 0) { 1204*ebcddc72SAlan Cox if (wakeups != last_launder) { 1205*ebcddc72SAlan Cox last_launder = wakeups; 1206*ebcddc72SAlan Cox last_target = target; 1207*ebcddc72SAlan Cox } else if (last_target - target >= 1208*ebcddc72SAlan Cox vm_background_launder_max * PAGE_SIZE / 1024) { 1209*ebcddc72SAlan Cox target = 0; 1210*ebcddc72SAlan Cox } 1211*ebcddc72SAlan Cox launder = vm_background_launder_rate * PAGE_SIZE / 1024; 1212*ebcddc72SAlan Cox launder /= VM_LAUNDER_RATE; 1213*ebcddc72SAlan Cox if (launder > target) 1214*ebcddc72SAlan Cox launder = target; 1215*ebcddc72SAlan Cox } 1216*ebcddc72SAlan Cox 1217*ebcddc72SAlan Cox dolaundry: 1218*ebcddc72SAlan Cox if (launder > 0) { 1219*ebcddc72SAlan Cox /* 1220*ebcddc72SAlan Cox * Because of I/O clustering, the number of laundered 1221*ebcddc72SAlan Cox * pages could exceed "target" by the maximum size of 1222*ebcddc72SAlan Cox * a cluster minus one. 1223*ebcddc72SAlan Cox */ 1224*ebcddc72SAlan Cox target -= min(vm_pageout_launder(domain, launder, 1225*ebcddc72SAlan Cox in_shortfall), target); 1226*ebcddc72SAlan Cox pause("laundp", hz / VM_LAUNDER_RATE); 1227*ebcddc72SAlan Cox } 1228*ebcddc72SAlan Cox 1229*ebcddc72SAlan Cox /* 1230*ebcddc72SAlan Cox * If we're not currently laundering pages and the page daemon 1231*ebcddc72SAlan Cox * hasn't posted a new request, sleep until the page daemon 1232*ebcddc72SAlan Cox * kicks us. 1233*ebcddc72SAlan Cox */ 1234*ebcddc72SAlan Cox vm_pagequeue_lock(pq); 1235*ebcddc72SAlan Cox if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) 1236*ebcddc72SAlan Cox (void)mtx_sleep(&vm_laundry_request, 1237*ebcddc72SAlan Cox vm_pagequeue_lockptr(pq), PVM, "launds", 0); 1238*ebcddc72SAlan Cox 1239*ebcddc72SAlan Cox /* 1240*ebcddc72SAlan Cox * If the pagedaemon has indicated that it's in shortfall, start 1241*ebcddc72SAlan Cox * a shortfall laundering unless we're already in the middle of 1242*ebcddc72SAlan Cox * one. This may preempt a background laundering. 
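 *
 * As a purely illustrative example of the background trigger above
 * (made-up numbers): with nclean = 10000 and ndirty = 1000, background
 * laundering starts once isqrt(wakeups - last_launder) >= 10, i.e.
 * after roughly 100 page daemon wakeups with no intervening laundering;
 * with ndirty = 5000, about 4 wakeups suffice.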
1243*ebcddc72SAlan Cox */ 1244*ebcddc72SAlan Cox if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && 1245*ebcddc72SAlan Cox (!in_shortfall || shortfall_cycle == 0)) { 1246*ebcddc72SAlan Cox shortfall = vm_laundry_target() + vm_pageout_deficit; 1247*ebcddc72SAlan Cox target = 0; 1248*ebcddc72SAlan Cox } else 1249*ebcddc72SAlan Cox shortfall = 0; 1250*ebcddc72SAlan Cox 1251*ebcddc72SAlan Cox if (target == 0) 1252*ebcddc72SAlan Cox vm_laundry_request = VM_LAUNDRY_IDLE; 1253*ebcddc72SAlan Cox vm_pagequeue_unlock(pq); 1254*ebcddc72SAlan Cox } 1255*ebcddc72SAlan Cox } 1256*ebcddc72SAlan Cox 1257*ebcddc72SAlan Cox /* 1258df8bae1dSRodney W. Grimes * vm_pageout_scan does the dirty work for the pageout daemon. 1259d9e23210SJeff Roberson * 1260*ebcddc72SAlan Cox * pass == 0: Update active LRU/deactivate pages 1261*ebcddc72SAlan Cox * pass >= 1: Free inactive pages 1262e57dd910SAlan Cox * 1263e57dd910SAlan Cox * Returns true if pass was zero or enough pages were freed by the inactive 1264e57dd910SAlan Cox * queue scan to meet the target. 1265df8bae1dSRodney W. Grimes */ 1266e57dd910SAlan Cox static bool 1267449c2e92SKonstantin Belousov vm_pageout_scan(struct vm_domain *vmd, int pass) 1268df8bae1dSRodney W. Grimes { 1269502ba6e4SJohn Dyson vm_page_t m, next; 12708d220203SAlan Cox struct vm_pagequeue *pq; 1271df8bae1dSRodney W. Grimes vm_object_t object; 127222cf98d1SAlan Cox long min_scan; 1273*ebcddc72SAlan Cox int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; 1274*ebcddc72SAlan Cox int page_shortage, scan_tick, scanned, starting_page_shortage; 1275*ebcddc72SAlan Cox boolean_t queue_locked; 12760d94caffSDavid Greenman 1277df8bae1dSRodney W. Grimes /* 1278d9e23210SJeff Roberson * If we need to reclaim memory ask kernel caches to return 1279c9612b2dSJeff Roberson * some. We rate limit to avoid thrashing. 1280d9e23210SJeff Roberson */ 1281c9612b2dSJeff Roberson if (vmd == &vm_dom[0] && pass > 0 && 1282a6bf3a9eSRyan Stone (time_uptime - lowmem_uptime) >= lowmem_period) { 1283d9e23210SJeff Roberson /* 1284855a310fSJeff Roberson * Decrease registered cache sizes. 1285855a310fSJeff Roberson */ 128614a0d74eSSteven Hartland SDT_PROBE0(vm, , , vm__lowmem_scan); 1287855a310fSJeff Roberson EVENTHANDLER_INVOKE(vm_lowmem, 0); 1288855a310fSJeff Roberson /* 1289d9e23210SJeff Roberson * We do this explicitly after the caches have been 1290d9e23210SJeff Roberson * drained above. 1291855a310fSJeff Roberson */ 1292855a310fSJeff Roberson uma_reclaim(); 1293a6bf3a9eSRyan Stone lowmem_uptime = time_uptime; 1294d9e23210SJeff Roberson } 12955985940eSJohn Dyson 1296311e34e2SKonstantin Belousov /* 129796240c89SEitan Adler * The addl_page_shortage is the number of temporarily 1298311e34e2SKonstantin Belousov * stuck pages in the inactive queue. In other words, the 1299449c2e92SKonstantin Belousov * number of pages from the inactive count that should be 1300311e34e2SKonstantin Belousov * discounted in setting the target for the active queue scan. 1301311e34e2SKonstantin Belousov */ 13029099545aSAlan Cox addl_page_shortage = 0; 13039099545aSAlan Cox 13041c7c3c6aSMatthew Dillon /* 1305e57dd910SAlan Cox * Calculate the number of pages that we want to free. This number 1306e57dd910SAlan Cox * can be negative if many pages are freed between the wakeup call to 1307e57dd910SAlan Cox * the page daemon and this calculation. 
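 *
 * Roughly, page_shortage ~= (free page target - current free count) +
 * any accumulated vm_pageout_deficit; vm_paging_target() is the precise
 * definition.
 *
 * Informal sketch of how the scan below disposes of each inactive page
 * (the code and its comments are authoritative):
 *
 *	held or busy                   -> skip it and discount it from the
 *	                                  inactive count
 *	invalid                        -> free it immediately
 *	referenced, object still in use -> reactivate it
 *	clean                          -> free it
 *	dirty, object alive            -> vm_page_launder() it
 *	dirty, object dead             -> leave it; the thread destroying
 *	                                  the object will free it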
13081c7c3c6aSMatthew Dillon */ 130960196cdaSAlan Cox if (pass > 0) { 131060196cdaSAlan Cox deficit = atomic_readandclear_int(&vm_pageout_deficit); 13119099545aSAlan Cox page_shortage = vm_paging_target() + deficit; 131260196cdaSAlan Cox } else 131360196cdaSAlan Cox page_shortage = deficit = 0; 131476386c7eSKonstantin Belousov starting_page_shortage = page_shortage; 13151c7c3c6aSMatthew Dillon 1316936524aaSMatthew Dillon /* 1317f095d1bbSAlan Cox * Start scanning the inactive queue for pages that we can free. The 1318f095d1bbSAlan Cox * scan will stop when we reach the target or we have scanned the 1319f095d1bbSAlan Cox * entire queue. (Note that m->act_count is not used to make 1320f095d1bbSAlan Cox * decisions for the inactive queue, only for the active queue.) 13218d220203SAlan Cox */ 1322449c2e92SKonstantin Belousov pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; 1323449c2e92SKonstantin Belousov maxscan = pq->pq_cnt; 13248d220203SAlan Cox vm_pagequeue_lock(pq); 13253ac8f842SMark Johnston queue_locked = TRUE; 13268d220203SAlan Cox for (m = TAILQ_FIRST(&pq->pq_pl); 13271c7c3c6aSMatthew Dillon m != NULL && maxscan-- > 0 && page_shortage > 0; 1328e929c00dSKirk McKusick m = next) { 13298d220203SAlan Cox vm_pagequeue_assert_locked(pq); 13303ac8f842SMark Johnston KASSERT(queue_locked, ("unlocked inactive queue")); 1331*ebcddc72SAlan Cox KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); 1332df8bae1dSRodney W. Grimes 13338d220203SAlan Cox PCPU_INC(cnt.v_pdpages); 1334c325e866SKonstantin Belousov next = TAILQ_NEXT(m, plinks.q); 1335df8bae1dSRodney W. Grimes 1336936524aaSMatthew Dillon /* 1337936524aaSMatthew Dillon * skip marker pages 1338936524aaSMatthew Dillon */ 1339936524aaSMatthew Dillon if (m->flags & PG_MARKER) 1340936524aaSMatthew Dillon continue; 1341936524aaSMatthew Dillon 13427900f95dSKonstantin Belousov KASSERT((m->flags & PG_FICTITIOUS) == 0, 13437900f95dSKonstantin Belousov ("Fictitious page %p cannot be in inactive queue", m)); 13447900f95dSKonstantin Belousov KASSERT((m->oflags & VPO_UNMANAGED) == 0, 13457900f95dSKonstantin Belousov ("Unmanaged page %p cannot be in inactive queue", m)); 13467900f95dSKonstantin Belousov 13478c616246SKonstantin Belousov /* 1348311e34e2SKonstantin Belousov * The page or object lock acquisitions fail if the 1349311e34e2SKonstantin Belousov * page was removed from the queue or moved to a 1350311e34e2SKonstantin Belousov * different position within the queue. In either 1351311e34e2SKonstantin Belousov * case, addl_page_shortage should not be incremented. 13528c616246SKonstantin Belousov */ 1353a3aeedabSAlan Cox if (!vm_pageout_page_lock(m, &next)) 1354a3aeedabSAlan Cox goto unlock_page; 1355a3aeedabSAlan Cox else if (m->hold_count != 0) { 1356a3aeedabSAlan Cox /* 1357a3aeedabSAlan Cox * Held pages are essentially stuck in the 1358a3aeedabSAlan Cox * queue. So, they ought to be discounted 1359a3aeedabSAlan Cox * from the inactive count. See the 1360e57dd910SAlan Cox * calculation of inactq_shortage before the 1361a3aeedabSAlan Cox * loop over the active queue below. 1362a3aeedabSAlan Cox */ 1363a3aeedabSAlan Cox addl_page_shortage++; 1364a3aeedabSAlan Cox goto unlock_page; 1365df8bae1dSRodney W. 
Grimes } 13669ee2165fSAlan Cox object = m->object; 1367a3aeedabSAlan Cox if (!VM_OBJECT_TRYWLOCK(object)) { 1368a3aeedabSAlan Cox if (!vm_pageout_fallback_object_lock(m, &next)) 1369a3aeedabSAlan Cox goto unlock_object; 1370a3aeedabSAlan Cox else if (m->hold_count != 0) { 1371b182ec9eSJohn Dyson addl_page_shortage++; 1372a3aeedabSAlan Cox goto unlock_object; 1373a3aeedabSAlan Cox } 1374a3aeedabSAlan Cox } 1375a3aeedabSAlan Cox if (vm_page_busied(m)) { 1376a3aeedabSAlan Cox /* 1377a3aeedabSAlan Cox * Don't mess with busy pages. Leave them at 1378a3aeedabSAlan Cox * the front of the queue. Most likely, they 1379a3aeedabSAlan Cox * are being paged out and will leave the 1380a3aeedabSAlan Cox * queue shortly after the scan finishes. So, 1381a3aeedabSAlan Cox * they ought to be discounted from the 1382a3aeedabSAlan Cox * inactive count. 1383a3aeedabSAlan Cox */ 1384a3aeedabSAlan Cox addl_page_shortage++; 1385a3aeedabSAlan Cox unlock_object: 1386a3aeedabSAlan Cox VM_OBJECT_WUNLOCK(object); 1387a3aeedabSAlan Cox unlock_page: 1388a3aeedabSAlan Cox vm_page_unlock(m); 138926f9a767SRodney W. Grimes continue; 139026f9a767SRodney W. Grimes } 1391a3aeedabSAlan Cox KASSERT(m->hold_count == 0, ("Held page %p", m)); 1392bd7e5f99SJohn Dyson 13937e006499SJohn Dyson /* 1394*ebcddc72SAlan Cox * Dequeue the inactive page and unlock the inactive page 1395*ebcddc72SAlan Cox * queue, invalidating the 'next' pointer. Dequeueing the 1396*ebcddc72SAlan Cox * page here avoids a later reacquisition (and release) of 1397*ebcddc72SAlan Cox * the inactive page queue lock when vm_page_activate(), 1398*ebcddc72SAlan Cox * vm_page_free(), or vm_page_launder() is called. Use a 1399*ebcddc72SAlan Cox * marker to remember our place in the inactive queue. 140048cc2fc7SKonstantin Belousov */ 1401c325e866SKonstantin Belousov TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); 1402*ebcddc72SAlan Cox vm_page_dequeue_locked(m); 14038d220203SAlan Cox vm_pagequeue_unlock(pq); 14043ac8f842SMark Johnston queue_locked = FALSE; 140548cc2fc7SKonstantin Belousov 140648cc2fc7SKonstantin Belousov /* 14078748f58cSKonstantin Belousov * Invalid pages can be easily freed. They cannot be 14088748f58cSKonstantin Belousov * mapped, vm_page_free() asserts this. 1409776f729cSKonstantin Belousov */ 14108748f58cSKonstantin Belousov if (m->valid == 0) 14118748f58cSKonstantin Belousov goto free_page; 1412776f729cSKonstantin Belousov 1413776f729cSKonstantin Belousov /* 1414960810ccSAlan Cox * If the page has been referenced and the object is not dead, 1415960810ccSAlan Cox * reactivate or requeue the page depending on whether the 1416960810ccSAlan Cox * object is mapped. 14177e006499SJohn Dyson */ 1418bb7858eaSJeff Roberson if ((m->aflags & PGA_REFERENCED) != 0) { 1419bb7858eaSJeff Roberson vm_page_aflag_clear(m, PGA_REFERENCED); 1420bb7858eaSJeff Roberson act_delta = 1; 142186fa2471SAlan Cox } else 142286fa2471SAlan Cox act_delta = 0; 1423bb7858eaSJeff Roberson if (object->ref_count != 0) { 1424bb7858eaSJeff Roberson act_delta += pmap_ts_referenced(m); 1425bb7858eaSJeff Roberson } else { 1426bb7858eaSJeff Roberson KASSERT(!pmap_page_is_mapped(m), 1427bb7858eaSJeff Roberson ("vm_pageout_scan: page %p is mapped", m)); 14282fe6e4d7SDavid Greenman } 1429bb7858eaSJeff Roberson if (act_delta != 0) { 143086fa2471SAlan Cox if (object->ref_count != 0) { 1431*ebcddc72SAlan Cox PCPU_INC(cnt.v_reactivated); 143226f9a767SRodney W. 
Grimes vm_page_activate(m); 1433960810ccSAlan Cox 1434960810ccSAlan Cox /* 1435960810ccSAlan Cox * Increase the activation count if the page 1436960810ccSAlan Cox * was referenced while in the inactive queue. 1437960810ccSAlan Cox * This makes it less likely that the page will 1438960810ccSAlan Cox * be returned prematurely to the inactive 1439960810ccSAlan Cox * queue. 1440960810ccSAlan Cox */ 1441bb7858eaSJeff Roberson m->act_count += act_delta + ACT_ADVANCE; 1442960810ccSAlan Cox goto drop_page; 1443*ebcddc72SAlan Cox } else if ((object->flags & OBJ_DEAD) == 0) { 1444*ebcddc72SAlan Cox vm_pagequeue_lock(pq); 1445*ebcddc72SAlan Cox queue_locked = TRUE; 1446*ebcddc72SAlan Cox m->queue = PQ_INACTIVE; 1447*ebcddc72SAlan Cox TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 1448*ebcddc72SAlan Cox vm_pagequeue_cnt_inc(pq); 1449*ebcddc72SAlan Cox goto drop_page; 1450*ebcddc72SAlan Cox } 1451960810ccSAlan Cox } 145267bf6868SJohn Dyson 14537e006499SJohn Dyson /* 14549fc4739dSAlan Cox * If the page appears to be clean at the machine-independent 14559fc4739dSAlan Cox * layer, then remove all of its mappings from the pmap in 1456a766ffd0SAlan Cox * anticipation of freeing it. If, however, any of the page's 1457a766ffd0SAlan Cox * mappings allow write access, then the page may still be 1458a766ffd0SAlan Cox * modified until the last of those mappings are removed. 14597e006499SJohn Dyson */ 1460aa044135SAlan Cox if (object->ref_count != 0) { 14619fc4739dSAlan Cox vm_page_test_dirty(m); 1462aa044135SAlan Cox if (m->dirty == 0) 1463b78ddb0bSAlan Cox pmap_remove_all(m); 1464aa044135SAlan Cox } 1465dcbcd518SBruce Evans 14666989c456SAlan Cox /* 1467*ebcddc72SAlan Cox * Clean pages can be freed, but dirty pages must be sent back 1468*ebcddc72SAlan Cox * to the laundry, unless they belong to a dead object. 1469*ebcddc72SAlan Cox * Requeueing dirty pages from dead objects is pointless, as 1470*ebcddc72SAlan Cox * they are being paged out and freed by the thread that 1471*ebcddc72SAlan Cox * destroyed the object. 14726989c456SAlan Cox */ 1473*ebcddc72SAlan Cox if (m->dirty == 0) { 14748748f58cSKonstantin Belousov free_page: 147578afdce6SAlan Cox vm_page_free(m); 147678afdce6SAlan Cox PCPU_INC(cnt.v_dfree); 14771c7c3c6aSMatthew Dillon --page_shortage; 1478*ebcddc72SAlan Cox } else if ((object->flags & OBJ_DEAD) == 0) 1479*ebcddc72SAlan Cox vm_page_launder(m); 1480776f729cSKonstantin Belousov drop_page: 148148cc2fc7SKonstantin Belousov vm_page_unlock(m); 148289f6b863SAttilio Rao VM_OBJECT_WUNLOCK(object); 14833ac8f842SMark Johnston if (!queue_locked) { 14848d220203SAlan Cox vm_pagequeue_lock(pq); 14853ac8f842SMark Johnston queue_locked = TRUE; 14866989c456SAlan Cox } 1487c325e866SKonstantin Belousov next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); 1488c325e866SKonstantin Belousov TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); 14890d94caffSDavid Greenman } 14908d220203SAlan Cox vm_pagequeue_unlock(pq); 149126f9a767SRodney W. Grimes 1492*ebcddc72SAlan Cox /* 1493*ebcddc72SAlan Cox * Wake up the laundry thread so that it can perform any needed 1494*ebcddc72SAlan Cox * laundering. If we didn't meet our target, we're in shortfall and 1495*ebcddc72SAlan Cox * need to launder more aggressively. 
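 *
 * Informally, the request posted below works out to:
 *
 *	nothing is posted unless no request is pending and the scan
 *	started with a shortage;
 *	shortage still unmet at the end -> VM_LAUNDRY_SHORTFALL
 *	shortage met during the scan    -> VM_LAUNDRY_BACKGROUND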
1496*ebcddc72SAlan Cox */ 1497*ebcddc72SAlan Cox if (vm_laundry_request == VM_LAUNDRY_IDLE && 1498*ebcddc72SAlan Cox starting_page_shortage > 0) { 1499*ebcddc72SAlan Cox pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; 1500*ebcddc72SAlan Cox vm_pagequeue_lock(pq); 1501*ebcddc72SAlan Cox if (page_shortage > 0) { 1502*ebcddc72SAlan Cox vm_laundry_request = VM_LAUNDRY_SHORTFALL; 1503*ebcddc72SAlan Cox PCPU_INC(cnt.v_pdshortfalls); 1504*ebcddc72SAlan Cox } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) 1505*ebcddc72SAlan Cox vm_laundry_request = VM_LAUNDRY_BACKGROUND; 1506*ebcddc72SAlan Cox wakeup(&vm_laundry_request); 1507*ebcddc72SAlan Cox vm_pagequeue_unlock(pq); 1508*ebcddc72SAlan Cox } 1509*ebcddc72SAlan Cox 15109452b5edSAlan Cox #if !defined(NO_SWAPPING) 15119452b5edSAlan Cox /* 1512f095d1bbSAlan Cox * Wakeup the swapout daemon if we didn't free the targeted number of 1513f095d1bbSAlan Cox * pages. 15149452b5edSAlan Cox */ 15159452b5edSAlan Cox if (vm_swap_enabled && page_shortage > 0) 15169452b5edSAlan Cox vm_req_vmdaemon(VM_SWAP_NORMAL); 15179452b5edSAlan Cox #endif 15189452b5edSAlan Cox 15199452b5edSAlan Cox /* 152076386c7eSKonstantin Belousov * If the inactive queue scan fails repeatedly to meet its 152176386c7eSKonstantin Belousov * target, kill the largest process. 152276386c7eSKonstantin Belousov */ 152376386c7eSKonstantin Belousov vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); 152476386c7eSKonstantin Belousov 152576386c7eSKonstantin Belousov /* 1526936524aaSMatthew Dillon * Compute the number of pages we want to try to move from the 1527*ebcddc72SAlan Cox * active queue to either the inactive or laundry queue. 1528*ebcddc72SAlan Cox * 1529*ebcddc72SAlan Cox * When scanning active pages, we make clean pages count more heavily 1530*ebcddc72SAlan Cox * towards the page shortage than dirty pages. This is because dirty 1531*ebcddc72SAlan Cox * pages must be laundered before they can be reused and thus have less 1532*ebcddc72SAlan Cox * utility when attempting to quickly alleviate a shortage. However, 1533*ebcddc72SAlan Cox * this weighting also causes the scan to deactivate dirty pages more 1534*ebcddc72SAlan Cox * aggressively, improving the effectiveness of clustering and 1535*ebcddc72SAlan Cox * ensuring that they can eventually be reused. 15361c7c3c6aSMatthew Dillon */ 1537*ebcddc72SAlan Cox inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + 1538*ebcddc72SAlan Cox vm_cnt.v_laundry_count / act_scan_laundry_weight) + 15399099545aSAlan Cox vm_paging_target() + deficit + addl_page_shortage; 1540*ebcddc72SAlan Cox page_shortage *= act_scan_laundry_weight; 15419099545aSAlan Cox 1542114f62c6SJeff Roberson pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; 1543114f62c6SJeff Roberson vm_pagequeue_lock(pq); 15449099545aSAlan Cox maxscan = pq->pq_cnt; 15459099545aSAlan Cox 1546d9e23210SJeff Roberson /* 1547d9e23210SJeff Roberson * If we're just idle polling, attempt to visit every 1548d9e23210SJeff Roberson * active page within 'update_period' seconds.
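 *
 * For example (hypothetical numbers): with the default update period of
 * 600 seconds and 600000 active pages, min_scan works out to about 1000
 * pages per second of elapsed time, enough to cycle through the entire
 * queue every ten minutes.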
1549d9e23210SJeff Roberson */ 155022cf98d1SAlan Cox scan_tick = ticks; 155122cf98d1SAlan Cox if (vm_pageout_update_period != 0) { 155222cf98d1SAlan Cox min_scan = pq->pq_cnt; 155322cf98d1SAlan Cox min_scan *= scan_tick - vmd->vmd_last_active_scan; 155422cf98d1SAlan Cox min_scan /= hz * vm_pageout_update_period; 155522cf98d1SAlan Cox } else 155622cf98d1SAlan Cox min_scan = 0; 1557e57dd910SAlan Cox if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0)) 155822cf98d1SAlan Cox vmd->vmd_last_active_scan = scan_tick; 15591c7c3c6aSMatthew Dillon 15601c7c3c6aSMatthew Dillon /* 156122cf98d1SAlan Cox * Scan the active queue for pages that can be deactivated. Update 156222cf98d1SAlan Cox * the per-page activity counter and use it to identify deactivation 156379144408SAlan Cox * candidates. Held pages may be deactivated. 15641c7c3c6aSMatthew Dillon */ 156522cf98d1SAlan Cox for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < 1566e57dd910SAlan Cox min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next, 156722cf98d1SAlan Cox scanned++) { 15689cf51988SAlan Cox KASSERT(m->queue == PQ_ACTIVE, 1569d3c09dd7SAlan Cox ("vm_pageout_scan: page %p isn't active", m)); 1570c325e866SKonstantin Belousov next = TAILQ_NEXT(m, plinks.q); 157122cf98d1SAlan Cox if ((m->flags & PG_MARKER) != 0) 15728dbca793STor Egge continue; 15737900f95dSKonstantin Belousov KASSERT((m->flags & PG_FICTITIOUS) == 0, 15747900f95dSKonstantin Belousov ("Fictitious page %p cannot be in active queue", m)); 15757900f95dSKonstantin Belousov KASSERT((m->oflags & VPO_UNMANAGED) == 0, 15767900f95dSKonstantin Belousov ("Unmanaged page %p cannot be in active queue", m)); 15779ee2165fSAlan Cox if (!vm_pageout_page_lock(m, &next)) { 15788c616246SKonstantin Belousov vm_page_unlock(m); 15792965a453SKip Macy continue; 15802965a453SKip Macy } 1581b18bfc3dSJohn Dyson 1582b18bfc3dSJohn Dyson /* 158379144408SAlan Cox * The count for page daemon pages is updated after checking 158479144408SAlan Cox * the page for eligibility. 1585b18bfc3dSJohn Dyson */ 15868d220203SAlan Cox PCPU_INC(cnt.v_pdpages); 1587ef743ce6SJohn Dyson 15887e006499SJohn Dyson /* 15897e006499SJohn Dyson * Check to see "how much" the page has been used. 15907e006499SJohn Dyson */ 159186fa2471SAlan Cox if ((m->aflags & PGA_REFERENCED) != 0) { 1592bb7858eaSJeff Roberson vm_page_aflag_clear(m, PGA_REFERENCED); 159386fa2471SAlan Cox act_delta = 1; 159486fa2471SAlan Cox } else 159586fa2471SAlan Cox act_delta = 0; 159686fa2471SAlan Cox 1597274132acSJeff Roberson /* 159879144408SAlan Cox * Perform an unsynchronized object ref count check. While 159979144408SAlan Cox * the page lock ensures that the page is not reallocated to 160079144408SAlan Cox * another object, in particular, one with unmanaged mappings 160179144408SAlan Cox * that cannot support pmap_ts_referenced(), two races are, 160279144408SAlan Cox * nonetheless, possible: 160379144408SAlan Cox * 1) The count was transitioning to zero, but we saw a non- 160479144408SAlan Cox * zero value. pmap_ts_referenced() will return zero 160579144408SAlan Cox * because the page is not mapped. 160679144408SAlan Cox * 2) The count was transitioning to one, but we saw zero. 160779144408SAlan Cox * This race delays the detection of a new reference. At 160879144408SAlan Cox * worst, we will deactivate and reactivate the page. 
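 *
 * The act_delta computed here feeds the activity counter a few lines
 * below; informally:
 *
 *	referenced: act_count = min(act_count + ACT_ADVANCE + act_delta,
 *	                            ACT_MAX)
 *	idle:       act_count -= min(act_count, ACT_DECLINE)
 *
 * and a page whose act_count reaches zero becomes a candidate for
 * deactivation or laundering.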
1609274132acSJeff Roberson */ 1610274132acSJeff Roberson if (m->object->ref_count != 0) 1611bb7858eaSJeff Roberson act_delta += pmap_ts_referenced(m); 1612bb7858eaSJeff Roberson 1613bb7858eaSJeff Roberson /* 1614bb7858eaSJeff Roberson * Advance or decay the act_count based on recent usage. 1615bb7858eaSJeff Roberson */ 161686fa2471SAlan Cox if (act_delta != 0) { 1617bb7858eaSJeff Roberson m->act_count += ACT_ADVANCE + act_delta; 161838efa82bSJohn Dyson if (m->act_count > ACT_MAX) 161938efa82bSJohn Dyson m->act_count = ACT_MAX; 162086fa2471SAlan Cox } else 162138efa82bSJohn Dyson m->act_count -= min(m->act_count, ACT_DECLINE); 1622bb7858eaSJeff Roberson 1623bb7858eaSJeff Roberson /* 1624*ebcddc72SAlan Cox * Move this page to the tail of the active, inactive or laundry 1625bb7858eaSJeff Roberson * queue depending on usage. 1626bb7858eaSJeff Roberson */ 162786fa2471SAlan Cox if (m->act_count == 0) { 16288d220203SAlan Cox /* Dequeue to avoid later lock recursion. */ 16298d220203SAlan Cox vm_page_dequeue_locked(m); 1630*ebcddc72SAlan Cox 1631*ebcddc72SAlan Cox /* 1632*ebcddc72SAlan Cox * When not short for inactive pages, let dirty pages go 1633*ebcddc72SAlan Cox * through the inactive queue before moving to the 1634*ebcddc72SAlan Cox * laundry queues. This gives them some extra time to 1635*ebcddc72SAlan Cox * be reactivated, potentially avoiding an expensive 1636*ebcddc72SAlan Cox * pageout. During a page shortage, the inactive queue 1637*ebcddc72SAlan Cox * is necessarily small, so we may move dirty pages 1638*ebcddc72SAlan Cox * directly to the laundry queue. 1639*ebcddc72SAlan Cox */ 1640*ebcddc72SAlan Cox if (inactq_shortage <= 0) 1641d4a272dbSJohn Dyson vm_page_deactivate(m); 1642*ebcddc72SAlan Cox else { 1643*ebcddc72SAlan Cox /* 1644*ebcddc72SAlan Cox * Calling vm_page_test_dirty() here would 1645*ebcddc72SAlan Cox * require acquisition of the object's write 1646*ebcddc72SAlan Cox * lock. However, during a page shortage, 1647*ebcddc72SAlan Cox * directing dirty pages into the laundry 1648*ebcddc72SAlan Cox * queue is only an optimization and not a 1649*ebcddc72SAlan Cox * requirement. Therefore, we simply rely on 1650*ebcddc72SAlan Cox * the opportunistic updates to the page's 1651*ebcddc72SAlan Cox * dirty field by the pmap. 1652*ebcddc72SAlan Cox */ 1653*ebcddc72SAlan Cox if (m->dirty == 0) { 1654*ebcddc72SAlan Cox vm_page_deactivate(m); 1655*ebcddc72SAlan Cox inactq_shortage -= 1656*ebcddc72SAlan Cox act_scan_laundry_weight; 1657*ebcddc72SAlan Cox } else { 1658*ebcddc72SAlan Cox vm_page_launder(m); 1659e57dd910SAlan Cox inactq_shortage--; 1660*ebcddc72SAlan Cox } 1661*ebcddc72SAlan Cox } 16628d220203SAlan Cox } else 16638d220203SAlan Cox vm_page_requeue_locked(m); 16642965a453SKip Macy vm_page_unlock(m); 166526f9a767SRodney W. Grimes } 16668d220203SAlan Cox vm_pagequeue_unlock(pq); 1667ceb0cf87SJohn Dyson #if !defined(NO_SWAPPING) 1668ceb0cf87SJohn Dyson /* 166987ff568cSAlan Cox * Idle process swapout -- run once per second when we are reclaiming 167087ff568cSAlan Cox * pages. 
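 *
 * (Recapping the placement decision made just above, informally:
 *	act_count > 0                        -> stay on the active queue
 *	act_count == 0, no inactive shortage -> inactive queue
 *	act_count == 0, shortage, clean page -> inactive queue, credited
 *	                                        act_scan_laundry_weight
 *	                                        times toward the shortage
 *	act_count == 0, shortage, dirty page -> laundry queue, credited
 *	                                        once.)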
1671ceb0cf87SJohn Dyson */ 167287ff568cSAlan Cox if (vm_swap_idle_enabled && pass > 0) { 1673ceb0cf87SJohn Dyson static long lsec; 1674227ee8a1SPoul-Henning Kamp if (time_second != lsec) { 167597824da3SAlan Cox vm_req_vmdaemon(VM_SWAP_IDLE); 1676227ee8a1SPoul-Henning Kamp lsec = time_second; 1677ceb0cf87SJohn Dyson } 1678ceb0cf87SJohn Dyson } 1679ceb0cf87SJohn Dyson #endif 1680e57dd910SAlan Cox return (page_shortage <= 0); 16812025d69bSKonstantin Belousov } 16822025d69bSKonstantin Belousov 1683449c2e92SKonstantin Belousov static int vm_pageout_oom_vote; 1684449c2e92SKonstantin Belousov 1685449c2e92SKonstantin Belousov /* 1686449c2e92SKonstantin Belousov * The pagedaemon threads randomly select one to perform the 1687449c2e92SKonstantin Belousov * OOM. Trying to kill processes before all pagedaemons have 1688449c2e92SKonstantin Belousov * failed to reach the free page target is premature. 1689449c2e92SKonstantin Belousov */ 1690449c2e92SKonstantin Belousov static void 169176386c7eSKonstantin Belousov vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, 169276386c7eSKonstantin Belousov int starting_page_shortage) 1693449c2e92SKonstantin Belousov { 1694449c2e92SKonstantin Belousov int old_vote; 1695449c2e92SKonstantin Belousov 169676386c7eSKonstantin Belousov if (starting_page_shortage <= 0 || starting_page_shortage != 169776386c7eSKonstantin Belousov page_shortage) 169876386c7eSKonstantin Belousov vmd->vmd_oom_seq = 0; 169976386c7eSKonstantin Belousov else 170076386c7eSKonstantin Belousov vmd->vmd_oom_seq++; 170176386c7eSKonstantin Belousov if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { 1702449c2e92SKonstantin Belousov if (vmd->vmd_oom) { 1703449c2e92SKonstantin Belousov vmd->vmd_oom = FALSE; 1704449c2e92SKonstantin Belousov atomic_subtract_int(&vm_pageout_oom_vote, 1); 1705449c2e92SKonstantin Belousov } 1706449c2e92SKonstantin Belousov return; 1707449c2e92SKonstantin Belousov } 1708449c2e92SKonstantin Belousov 170976386c7eSKonstantin Belousov /* 171076386c7eSKonstantin Belousov * Do not follow the call sequence until the OOM condition is 171176386c7eSKonstantin Belousov * cleared. 171276386c7eSKonstantin Belousov */ 171376386c7eSKonstantin Belousov vmd->vmd_oom_seq = 0; 171476386c7eSKonstantin Belousov 1715449c2e92SKonstantin Belousov if (vmd->vmd_oom) 1716449c2e92SKonstantin Belousov return; 1717449c2e92SKonstantin Belousov 1718449c2e92SKonstantin Belousov vmd->vmd_oom = TRUE; 1719449c2e92SKonstantin Belousov old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); 1720449c2e92SKonstantin Belousov if (old_vote != vm_ndomains - 1) 1721449c2e92SKonstantin Belousov return; 1722449c2e92SKonstantin Belousov 1723449c2e92SKonstantin Belousov /* 1724449c2e92SKonstantin Belousov * The current pagedaemon thread is the last in the quorum to 1725449c2e92SKonstantin Belousov * start OOM. Initiate the selection and signaling of the 1726449c2e92SKonstantin Belousov * victim. 1727449c2e92SKonstantin Belousov */ 1728449c2e92SKonstantin Belousov vm_pageout_oom(VM_OOM_MEM); 1729449c2e92SKonstantin Belousov 1730449c2e92SKonstantin Belousov /* 1731449c2e92SKonstantin Belousov * After one round of OOM terror, recall our vote. On the 1732449c2e92SKonstantin Belousov * next pass, the current pagedaemon will vote again if the low 1733449c2e92SKonstantin Belousov * memory condition is still there, due to vmd_oom being 1734449c2e92SKonstantin Belousov * false.
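 *
 * To summarize the protocol implemented by this function (informally):
 * a domain casts a vote only after vm_pageout_oom_seq consecutive scans
 * that freed nothing while short of free pages, and any progress
 * withdraws the vote.  Only the last domain to vote, i.e. when the vote
 * count reaches vm_ndomains, actually calls vm_pageout_oom(), and it
 * then rescinds its own vote as described above.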
1735449c2e92SKonstantin Belousov */ 1736449c2e92SKonstantin Belousov vmd->vmd_oom = FALSE; 1737449c2e92SKonstantin Belousov atomic_subtract_int(&vm_pageout_oom_vote, 1); 1738449c2e92SKonstantin Belousov } 17392025d69bSKonstantin Belousov 17403949873fSKonstantin Belousov /* 17413949873fSKonstantin Belousov * The OOM killer is the page daemon's action of last resort when 17423949873fSKonstantin Belousov * memory allocation requests have been stalled for a prolonged period 17433949873fSKonstantin Belousov * of time because it cannot reclaim memory. This function computes 17443949873fSKonstantin Belousov * the approximate number of physical pages that could be reclaimed if 17453949873fSKonstantin Belousov * the specified address space is destroyed. 17463949873fSKonstantin Belousov * 17473949873fSKonstantin Belousov * Private, anonymous memory owned by the address space is the 17483949873fSKonstantin Belousov * principal resource that we expect to recover after an OOM kill. 17493949873fSKonstantin Belousov * Since the physical pages mapped by the address space's COW entries 17503949873fSKonstantin Belousov * are typically shared pages, they are unlikely to be released and so 17513949873fSKonstantin Belousov * they are not counted. 17523949873fSKonstantin Belousov * 17533949873fSKonstantin Belousov * To get to the point where the page daemon runs the OOM killer, its 17543949873fSKonstantin Belousov * efforts to write-back vnode-backed pages may have stalled. This 17553949873fSKonstantin Belousov * could be caused by a memory allocation deadlock in the write path 17563949873fSKonstantin Belousov * that might be resolved by an OOM kill. Therefore, physical pages 17573949873fSKonstantin Belousov * belonging to vnode-backed objects are counted, because they might 17583949873fSKonstantin Belousov * be freed without being written out first if the address space holds 17593949873fSKonstantin Belousov * the last reference to an unlinked vnode. 17603949873fSKonstantin Belousov * 17613949873fSKonstantin Belousov * Similarly, physical pages belonging to OBJT_PHYS objects are 17623949873fSKonstantin Belousov * counted because the address space might hold the last reference to 17633949873fSKonstantin Belousov * the object. 
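 *
 * Condensed, the estimate computed below is (informally):
 *
 *	for each map entry that is not a submap:
 *		if it still needs its own COW copy and the object is
 *		shared (ref_count != 1), skip it;
 *		if the backing object is DEFAULT, SWAP, PHYS or VNODE,
 *		add its resident_page_count.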
17643949873fSKonstantin Belousov */ 17653949873fSKonstantin Belousov static long 17663949873fSKonstantin Belousov vm_pageout_oom_pagecount(struct vmspace *vmspace) 17673949873fSKonstantin Belousov { 17683949873fSKonstantin Belousov vm_map_t map; 17693949873fSKonstantin Belousov vm_map_entry_t entry; 17703949873fSKonstantin Belousov vm_object_t obj; 17713949873fSKonstantin Belousov long res; 17723949873fSKonstantin Belousov 17733949873fSKonstantin Belousov map = &vmspace->vm_map; 17743949873fSKonstantin Belousov KASSERT(!map->system_map, ("system map")); 17753949873fSKonstantin Belousov sx_assert(&map->lock, SA_LOCKED); 17763949873fSKonstantin Belousov res = 0; 17773949873fSKonstantin Belousov for (entry = map->header.next; entry != &map->header; 17783949873fSKonstantin Belousov entry = entry->next) { 17793949873fSKonstantin Belousov if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) 17803949873fSKonstantin Belousov continue; 17813949873fSKonstantin Belousov obj = entry->object.vm_object; 17823949873fSKonstantin Belousov if (obj == NULL) 17833949873fSKonstantin Belousov continue; 17843949873fSKonstantin Belousov if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && 17853949873fSKonstantin Belousov obj->ref_count != 1) 17863949873fSKonstantin Belousov continue; 17873949873fSKonstantin Belousov switch (obj->type) { 17883949873fSKonstantin Belousov case OBJT_DEFAULT: 17893949873fSKonstantin Belousov case OBJT_SWAP: 17903949873fSKonstantin Belousov case OBJT_PHYS: 17913949873fSKonstantin Belousov case OBJT_VNODE: 17923949873fSKonstantin Belousov res += obj->resident_page_count; 17933949873fSKonstantin Belousov break; 17943949873fSKonstantin Belousov } 17953949873fSKonstantin Belousov } 17963949873fSKonstantin Belousov return (res); 17973949873fSKonstantin Belousov } 17983949873fSKonstantin Belousov 17992025d69bSKonstantin Belousov void 18002025d69bSKonstantin Belousov vm_pageout_oom(int shortage) 18012025d69bSKonstantin Belousov { 18022025d69bSKonstantin Belousov struct proc *p, *bigproc; 18032025d69bSKonstantin Belousov vm_offset_t size, bigsize; 18042025d69bSKonstantin Belousov struct thread *td; 18056bed074cSKonstantin Belousov struct vmspace *vm; 18062025d69bSKonstantin Belousov 18072025d69bSKonstantin Belousov /* 18081c58e4e5SJohn Baldwin * We keep the process bigproc locked once we find it to keep anyone 18091c58e4e5SJohn Baldwin * from messing with it; however, there is a possibility of 181028323addSBryan Drewery * deadlock if process B is bigproc and one of its child processes 18111c58e4e5SJohn Baldwin * attempts to propagate a signal to B while we are waiting for A's 18121c58e4e5SJohn Baldwin * lock while walking this list. To avoid this, we don't block on 18131c58e4e5SJohn Baldwin * the process lock but just skip a process if it is already locked. 18145663e6deSDavid Greenman */ 18155663e6deSDavid Greenman bigproc = NULL; 18165663e6deSDavid Greenman bigsize = 0; 18171005a129SJohn Baldwin sx_slock(&allproc_lock); 1818e602ba25SJulian Elischer FOREACH_PROC_IN_SYSTEM(p) { 1819e602ba25SJulian Elischer int breakout; 1820dcbcd518SBruce Evans 182171943c3dSKonstantin Belousov PROC_LOCK(p); 182271943c3dSKonstantin Belousov 18231c58e4e5SJohn Baldwin /* 18243f1c4c4fSKonstantin Belousov * If this is a system, protected or killed process, skip it. 
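 *
 * ("System" is interpreted broadly by the test below: init (pid 1),
 * processes in exec or exit, protected processes and, while swap space
 * remains available, other low-numbered pids are all exempt.)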
18255663e6deSDavid Greenman */ 182671943c3dSKonstantin Belousov if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | 182771943c3dSKonstantin Belousov P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || 182871943c3dSKonstantin Belousov p->p_pid == 1 || P_KILLED(p) || 182971943c3dSKonstantin Belousov (p->p_pid < 48 && swap_pager_avail != 0)) { 18308606d880SJohn Baldwin PROC_UNLOCK(p); 18315663e6deSDavid Greenman continue; 18325663e6deSDavid Greenman } 18335663e6deSDavid Greenman /* 1834dcbcd518SBruce Evans * If the process is in a non-running type state, 1835e602ba25SJulian Elischer * don't touch it. Check all the threads individually. 18365663e6deSDavid Greenman */ 1837e602ba25SJulian Elischer breakout = 0; 1838e602ba25SJulian Elischer FOREACH_THREAD_IN_PROC(p, td) { 1839982d11f8SJeff Roberson thread_lock(td); 184071fad9fdSJulian Elischer if (!TD_ON_RUNQ(td) && 184171fad9fdSJulian Elischer !TD_IS_RUNNING(td) && 1842f497cda2SEdward Tomasz Napierala !TD_IS_SLEEPING(td) && 1843b98acc0aSKonstantin Belousov !TD_IS_SUSPENDED(td) && 1844b98acc0aSKonstantin Belousov !TD_IS_SWAPPED(td)) { 1845982d11f8SJeff Roberson thread_unlock(td); 1846e602ba25SJulian Elischer breakout = 1; 1847e602ba25SJulian Elischer break; 1848e602ba25SJulian Elischer } 1849982d11f8SJeff Roberson thread_unlock(td); 1850e602ba25SJulian Elischer } 1851e602ba25SJulian Elischer if (breakout) { 18521c58e4e5SJohn Baldwin PROC_UNLOCK(p); 18535663e6deSDavid Greenman continue; 18545663e6deSDavid Greenman } 18555663e6deSDavid Greenman /* 18565663e6deSDavid Greenman * get the process size 18575663e6deSDavid Greenman */ 18586bed074cSKonstantin Belousov vm = vmspace_acquire_ref(p); 18596bed074cSKonstantin Belousov if (vm == NULL) { 18606bed074cSKonstantin Belousov PROC_UNLOCK(p); 18616bed074cSKonstantin Belousov continue; 18626bed074cSKonstantin Belousov } 186395e2409aSKonstantin Belousov _PHOLD_LITE(p); 186472d97679SDavid Schultz PROC_UNLOCK(p); 186595e2409aSKonstantin Belousov sx_sunlock(&allproc_lock); 186695e2409aSKonstantin Belousov if (!vm_map_trylock_read(&vm->vm_map)) { 186771943c3dSKonstantin Belousov vmspace_free(vm); 186895e2409aSKonstantin Belousov sx_slock(&allproc_lock); 186995e2409aSKonstantin Belousov PRELE(p); 187072d97679SDavid Schultz continue; 187172d97679SDavid Schultz } 18727981aa24SKonstantin Belousov size = vmspace_swap_count(vm); 18732025d69bSKonstantin Belousov if (shortage == VM_OOM_MEM) 18743949873fSKonstantin Belousov size += vm_pageout_oom_pagecount(vm); 18753949873fSKonstantin Belousov vm_map_unlock_read(&vm->vm_map); 18766bed074cSKonstantin Belousov vmspace_free(vm); 187795e2409aSKonstantin Belousov sx_slock(&allproc_lock); 18783949873fSKonstantin Belousov 18795663e6deSDavid Greenman /* 18803949873fSKonstantin Belousov * If this process is bigger than the biggest one, 18815663e6deSDavid Greenman * remember it. 
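 *
 * ("Size" is the value computed above, roughly
 *	size = vmspace_swap_count(vm) +
 *	    (shortage == VM_OOM_MEM ? vm_pageout_oom_pagecount(vm) : 0),
 * i.e. an estimate of how much memory killing the process would
 * recover.)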
18825663e6deSDavid Greenman */ 18835663e6deSDavid Greenman if (size > bigsize) { 18841c58e4e5SJohn Baldwin if (bigproc != NULL) 188571943c3dSKonstantin Belousov PRELE(bigproc); 18865663e6deSDavid Greenman bigproc = p; 18875663e6deSDavid Greenman bigsize = size; 188871943c3dSKonstantin Belousov } else { 188971943c3dSKonstantin Belousov PRELE(p); 189071943c3dSKonstantin Belousov } 18915663e6deSDavid Greenman } 18921005a129SJohn Baldwin sx_sunlock(&allproc_lock); 18935663e6deSDavid Greenman if (bigproc != NULL) { 18948311a2b8SWill Andrews if (vm_panic_on_oom != 0) 18958311a2b8SWill Andrews panic("out of swap space"); 189671943c3dSKonstantin Belousov PROC_LOCK(bigproc); 1897729b1e51SDavid Greenman killproc(bigproc, "out of swap space"); 1898fa885116SJulian Elischer sched_nice(bigproc, PRIO_MIN); 189971943c3dSKonstantin Belousov _PRELE(bigproc); 19001c58e4e5SJohn Baldwin PROC_UNLOCK(bigproc); 190144f1c916SBryan Drewery wakeup(&vm_cnt.v_free_count); 19025663e6deSDavid Greenman } 19035663e6deSDavid Greenman } 190426f9a767SRodney W. Grimes 1905449c2e92SKonstantin Belousov static void 1906449c2e92SKonstantin Belousov vm_pageout_worker(void *arg) 1907449c2e92SKonstantin Belousov { 1908449c2e92SKonstantin Belousov struct vm_domain *domain; 190970cf3cedSAlan Cox int domidx, pass; 1910e57dd910SAlan Cox bool target_met; 1911449c2e92SKonstantin Belousov 1912449c2e92SKonstantin Belousov domidx = (uintptr_t)arg; 1913449c2e92SKonstantin Belousov domain = &vm_dom[domidx]; 191470cf3cedSAlan Cox pass = 0; 1915e57dd910SAlan Cox target_met = true; 1916449c2e92SKonstantin Belousov 1917449c2e92SKonstantin Belousov /* 1918949c9186SKonstantin Belousov * XXXKIB It could be useful to bind pageout daemon threads to 1919949c9186SKonstantin Belousov * the cores belonging to the domain, from which vm_page_array 1920949c9186SKonstantin Belousov * is allocated. 1921449c2e92SKonstantin Belousov */ 1922449c2e92SKonstantin Belousov 1923449c2e92SKonstantin Belousov KASSERT(domain->vmd_segs != 0, ("domain without segments")); 192422cf98d1SAlan Cox domain->vmd_last_active_scan = ticks; 1925449c2e92SKonstantin Belousov vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); 19267e78597fSMark Johnston vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE); 19277e78597fSMark Johnston TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl, 19287e78597fSMark Johnston &domain->vmd_inacthead, plinks.q); 1929449c2e92SKonstantin Belousov 1930449c2e92SKonstantin Belousov /* 1931449c2e92SKonstantin Belousov * The pageout daemon worker is never done, so loop forever. 1932449c2e92SKonstantin Belousov */ 1933449c2e92SKonstantin Belousov while (TRUE) { 1934449c2e92SKonstantin Belousov mtx_lock(&vm_page_queue_free_mtx); 193556ce0690SAlan Cox 193656ce0690SAlan Cox /* 193756ce0690SAlan Cox * Generally, after a level >= 1 scan, if there are enough 193856ce0690SAlan Cox * free pages to wakeup the waiters, then they are already 193956ce0690SAlan Cox * awake. A call to vm_page_free() during the scan awakened 194056ce0690SAlan Cox * them. However, in the following case, this wakeup serves 194156ce0690SAlan Cox * to bound the amount of time that a thread might wait. 194256ce0690SAlan Cox * Suppose a thread's call to vm_page_alloc() fails, but 194356ce0690SAlan Cox * before that thread calls VM_WAIT, enough pages are freed by 194456ce0690SAlan Cox * other threads to alleviate the free page shortage. The 194556ce0690SAlan Cox * thread will, nonetheless, wait until another page is freed 194656ce0690SAlan Cox * or this wakeup is performed. 
194756ce0690SAlan Cox */ 1948449c2e92SKonstantin Belousov if (vm_pages_needed && !vm_page_count_min()) { 194956ce0690SAlan Cox vm_pages_needed = false; 195044f1c916SBryan Drewery wakeup(&vm_cnt.v_free_count); 1951449c2e92SKonstantin Belousov } 195256ce0690SAlan Cox 1953449c2e92SKonstantin Belousov /* 1954e57dd910SAlan Cox * Do not clear vm_pageout_wanted until we reach our free page 1955e57dd910SAlan Cox * target. Otherwise, we may be awakened over and over again, 1956e57dd910SAlan Cox * wasting CPU time. 1957449c2e92SKonstantin Belousov */ 1958e57dd910SAlan Cox if (vm_pageout_wanted && target_met) 195956ce0690SAlan Cox vm_pageout_wanted = false; 196056ce0690SAlan Cox 196156ce0690SAlan Cox /* 196256ce0690SAlan Cox * Might the page daemon receive a wakeup call? 196356ce0690SAlan Cox */ 196456ce0690SAlan Cox if (vm_pageout_wanted) { 196556ce0690SAlan Cox /* 196656ce0690SAlan Cox * No. Either vm_pageout_wanted was set by another 196756ce0690SAlan Cox * thread during the previous scan, which must have 196856ce0690SAlan Cox * been a level 0 scan, or vm_pageout_wanted was 196956ce0690SAlan Cox * already set and the scan failed to free enough 1970*ebcddc72SAlan Cox * pages. If we haven't yet performed a level >= 1 1971*ebcddc72SAlan Cox * (page reclamation) scan, then increase the level 1972*ebcddc72SAlan Cox * and scan again now. Otherwise, sleep a bit and 1973*ebcddc72SAlan Cox * try again later. 197456ce0690SAlan Cox */ 197556ce0690SAlan Cox mtx_unlock(&vm_page_queue_free_mtx); 1976*ebcddc72SAlan Cox if (pass >= 1) 1977*ebcddc72SAlan Cox pause("psleep", hz / VM_INACT_SCAN_RATE); 197870cf3cedSAlan Cox pass++; 1979449c2e92SKonstantin Belousov } else { 1980449c2e92SKonstantin Belousov /* 198156ce0690SAlan Cox * Yes. Sleep until pages need to be reclaimed or 198256ce0690SAlan Cox * have their reference stats updated. 1983449c2e92SKonstantin Belousov */ 198456ce0690SAlan Cox if (mtx_sleep(&vm_pageout_wanted, 198556ce0690SAlan Cox &vm_page_queue_free_mtx, PDROP | PVM, "psleep", 198656ce0690SAlan Cox hz) == 0) { 198756ce0690SAlan Cox PCPU_INC(cnt.v_pdwakeups); 198870cf3cedSAlan Cox pass = 1; 1989d9347bcaSAlan Cox } else 199070cf3cedSAlan Cox pass = 0; 199156ce0690SAlan Cox } 199256ce0690SAlan Cox 199370cf3cedSAlan Cox target_met = vm_pageout_scan(domain, pass); 1994449c2e92SKonstantin Belousov } 1995449c2e92SKonstantin Belousov } 1996449c2e92SKonstantin Belousov 1997df8bae1dSRodney W. Grimes /* 19984d19f4adSSteven Hartland * vm_pageout_init initialises basic pageout daemon settings. 1999df8bae1dSRodney W. Grimes */ 20002b14f991SJulian Elischer static void 20014d19f4adSSteven Hartland vm_pageout_init(void) 2002df8bae1dSRodney W. Grimes { 2003df8bae1dSRodney W. Grimes /* 2004df8bae1dSRodney W. Grimes * Initialize some paging parameters. 2005df8bae1dSRodney W. Grimes */ 200644f1c916SBryan Drewery vm_cnt.v_interrupt_free_min = 2; 200744f1c916SBryan Drewery if (vm_cnt.v_page_count < 2000) 2008f35329acSJohn Dyson vm_pageout_page_count = 8; 2009f6b04d2bSDavid Greenman 201045ae1d91SAlan Cox /* 201145ae1d91SAlan Cox * v_free_reserved needs to include enough for the largest 201245ae1d91SAlan Cox * swap pager structures plus enough for any pv_entry structs 201345ae1d91SAlan Cox * when paging. 
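 *
 * Ballpark figures for the formulas below (hypothetical machine with
 * 4 KiB pages and about 1 GiB of RAM, so v_page_count ~= 262144; the
 * v_free_reserved adjustments are ignored for brevity):
 *
 *	v_free_min        ~= 4 + (262144 - 1024) / 200 ~= 1309 pages (~5 MB)
 *	v_free_target     ~= 4 * v_free_min, plus the reserve
 *	v_inactive_target  = 3 * v_free_target / 2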
201445ae1d91SAlan Cox */ 201544f1c916SBryan Drewery if (vm_cnt.v_page_count > 1024) 201644f1c916SBryan Drewery vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200; 20172feb50bfSAttilio Rao else 201844f1c916SBryan Drewery vm_cnt.v_free_min = 4; 201944f1c916SBryan Drewery vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + 202044f1c916SBryan Drewery vm_cnt.v_interrupt_free_min; 202144f1c916SBryan Drewery vm_cnt.v_free_reserved = vm_pageout_page_count + 202244f1c916SBryan Drewery vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768); 202344f1c916SBryan Drewery vm_cnt.v_free_severe = vm_cnt.v_free_min / 2; 202444f1c916SBryan Drewery vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved; 202544f1c916SBryan Drewery vm_cnt.v_free_min += vm_cnt.v_free_reserved; 202644f1c916SBryan Drewery vm_cnt.v_free_severe += vm_cnt.v_free_reserved; 202744f1c916SBryan Drewery vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2; 202844f1c916SBryan Drewery if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3) 202944f1c916SBryan Drewery vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3; 2030df8bae1dSRodney W. Grimes 2031d9e23210SJeff Roberson /* 2032d9e23210SJeff Roberson * Set the default wakeup threshold to be 10% above the minimum 2033d9e23210SJeff Roberson * page limit. This keeps the steady state out of shortfall. 2034d9e23210SJeff Roberson */ 203544f1c916SBryan Drewery vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11; 2036d9e23210SJeff Roberson 2037d9e23210SJeff Roberson /* 2038d9e23210SJeff Roberson * Set interval in seconds for active scan. We want to visit each 2039c9612b2dSJeff Roberson * page at least once every ten minutes. This is to prevent worst 2040c9612b2dSJeff Roberson * case paging behaviors with stale active LRU. 2041d9e23210SJeff Roberson */ 2042d9e23210SJeff Roberson if (vm_pageout_update_period == 0) 2043c9612b2dSJeff Roberson vm_pageout_update_period = 600; 2044d9e23210SJeff Roberson 2045df8bae1dSRodney W. Grimes /* XXX does not really belong here */ 2046df8bae1dSRodney W. Grimes if (vm_page_max_wired == 0) 204744f1c916SBryan Drewery vm_page_max_wired = vm_cnt.v_free_count / 3; 2048*ebcddc72SAlan Cox 2049*ebcddc72SAlan Cox /* 2050*ebcddc72SAlan Cox * Target amount of memory to move out of the laundry queue during a 2051*ebcddc72SAlan Cox * background laundering. This is proportional to the amount of system 2052*ebcddc72SAlan Cox * memory. 2053*ebcddc72SAlan Cox */ 2054*ebcddc72SAlan Cox vm_background_launder_target = (vm_cnt.v_free_target - 2055*ebcddc72SAlan Cox vm_cnt.v_free_min) / 10; 20564d19f4adSSteven Hartland } 20574d19f4adSSteven Hartland 20584d19f4adSSteven Hartland /* 20594d19f4adSSteven Hartland * vm_pageout is the high level pageout daemon. 20604d19f4adSSteven Hartland */ 20614d19f4adSSteven Hartland static void 20624d19f4adSSteven Hartland vm_pageout(void) 20634d19f4adSSteven Hartland { 206444ec2b63SKonstantin Belousov int error; 206562d70a81SJohn Baldwin #ifdef VM_NUMA_ALLOC 206644ec2b63SKonstantin Belousov int i; 20674d19f4adSSteven Hartland #endif 2068df8bae1dSRodney W. 
Grimes 206924a1cce3SDavid Greenman swap_pager_swap_init(); 2070*ebcddc72SAlan Cox error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, 2071*ebcddc72SAlan Cox 0, 0, "laundry: dom0"); 2072*ebcddc72SAlan Cox if (error != 0) 2073*ebcddc72SAlan Cox panic("starting laundry for domain 0, error %d", error); 207462d70a81SJohn Baldwin #ifdef VM_NUMA_ALLOC 2075449c2e92SKonstantin Belousov for (i = 1; i < vm_ndomains; i++) { 2076449c2e92SKonstantin Belousov error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, 2077449c2e92SKonstantin Belousov curproc, NULL, 0, 0, "dom%d", i); 2078449c2e92SKonstantin Belousov if (error != 0) { 2079449c2e92SKonstantin Belousov panic("starting pageout for domain %d, error %d\n", 2080449c2e92SKonstantin Belousov i, error); 2081dc2efb27SJohn Dyson } 2082f919ebdeSDavid Greenman } 2083449c2e92SKonstantin Belousov #endif 208444ec2b63SKonstantin Belousov error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 208544ec2b63SKonstantin Belousov 0, 0, "uma"); 208644ec2b63SKonstantin Belousov if (error != 0) 208744ec2b63SKonstantin Belousov panic("starting uma_reclaim helper, error %d\n", error); 2088d395270dSDimitry Andric vm_pageout_worker((void *)(uintptr_t)0); 2089df8bae1dSRodney W. Grimes } 209026f9a767SRodney W. Grimes 20916b4b77adSAlan Cox /* 2092e9f995d8SAlan Cox * Unless the free page queue lock is held by the caller, this function 20936b4b77adSAlan Cox * should be regarded as advisory. Specifically, the caller should 209444f1c916SBryan Drewery * not msleep() on &vm_cnt.v_free_count following this function unless 2095e9f995d8SAlan Cox * the free page queue lock is held until the msleep() is performed. 20966b4b77adSAlan Cox */ 2097e0c5a895SJohn Dyson void 20984a365329SAndrey Zonov pagedaemon_wakeup(void) 2099e0c5a895SJohn Dyson { 2100a1c0a785SAlan Cox 210156ce0690SAlan Cox if (!vm_pageout_wanted && curthread->td_proc != pageproc) { 210256ce0690SAlan Cox vm_pageout_wanted = true; 210356ce0690SAlan Cox wakeup(&vm_pageout_wanted); 2104e0c5a895SJohn Dyson } 2105e0c5a895SJohn Dyson } 2106e0c5a895SJohn Dyson 210738efa82bSJohn Dyson #if !defined(NO_SWAPPING) 21085afce282SDavid Greenman static void 210997824da3SAlan Cox vm_req_vmdaemon(int req) 21105afce282SDavid Greenman { 21115afce282SDavid Greenman static int lastrun = 0; 21125afce282SDavid Greenman 211397824da3SAlan Cox mtx_lock(&vm_daemon_mtx); 211497824da3SAlan Cox vm_pageout_req_swapout |= req; 2115b18bfc3dSJohn Dyson if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 21165afce282SDavid Greenman wakeup(&vm_daemon_needed); 21175afce282SDavid Greenman lastrun = ticks; 21185afce282SDavid Greenman } 211997824da3SAlan Cox mtx_unlock(&vm_daemon_mtx); 21205afce282SDavid Greenman } 21215afce282SDavid Greenman 21222b14f991SJulian Elischer static void 21234a365329SAndrey Zonov vm_daemon(void) 21240d94caffSDavid Greenman { 212591d5354aSJohn Baldwin struct rlimit rsslim; 2126dcbcd518SBruce Evans struct proc *p; 2127dcbcd518SBruce Evans struct thread *td; 21286bed074cSKonstantin Belousov struct vmspace *vm; 2129099e7e95SEdward Tomasz Napierala int breakout, swapout_flags, tryagain, attempts; 2130afcc55f3SEdward Tomasz Napierala #ifdef RACCT 2131099e7e95SEdward Tomasz Napierala uint64_t rsize, ravailable; 2132afcc55f3SEdward Tomasz Napierala #endif 21330d94caffSDavid Greenman 21342fe6e4d7SDavid Greenman while (TRUE) { 213597824da3SAlan Cox mtx_lock(&vm_daemon_mtx); 21364b5c9cf6SEdward Tomasz Napierala msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 2137099e7e95SEdward Tomasz Napierala #ifdef RACCT 
21384b5c9cf6SEdward Tomasz Napierala racct_enable ? hz : 0 2139099e7e95SEdward Tomasz Napierala #else 21404b5c9cf6SEdward Tomasz Napierala 0 2141099e7e95SEdward Tomasz Napierala #endif 21424b5c9cf6SEdward Tomasz Napierala ); 214397824da3SAlan Cox swapout_flags = vm_pageout_req_swapout; 21444c1f8ee9SDavid Greenman vm_pageout_req_swapout = 0; 214597824da3SAlan Cox mtx_unlock(&vm_daemon_mtx); 214697824da3SAlan Cox if (swapout_flags) 214797824da3SAlan Cox swapout_procs(swapout_flags); 214897824da3SAlan Cox 21492fe6e4d7SDavid Greenman /* 21500d94caffSDavid Greenman * scan the processes for exceeding their rlimits or if 21510d94caffSDavid Greenman * process is swapped out -- deactivate pages 21522fe6e4d7SDavid Greenman */ 2153099e7e95SEdward Tomasz Napierala tryagain = 0; 2154099e7e95SEdward Tomasz Napierala attempts = 0; 2155099e7e95SEdward Tomasz Napierala again: 2156099e7e95SEdward Tomasz Napierala attempts++; 21571005a129SJohn Baldwin sx_slock(&allproc_lock); 2158f67af5c9SXin LI FOREACH_PROC_IN_SYSTEM(p) { 2159fe2144fdSLuoqi Chen vm_pindex_t limit, size; 21602fe6e4d7SDavid Greenman 21612fe6e4d7SDavid Greenman /* 21622fe6e4d7SDavid Greenman * if this is a system process or if we have already 21632fe6e4d7SDavid Greenman * looked at this process, skip it. 21642fe6e4d7SDavid Greenman */ 2165897ecacdSJohn Baldwin PROC_LOCK(p); 21668e6fa660SJohn Baldwin if (p->p_state != PRS_NORMAL || 21678e6fa660SJohn Baldwin p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { 2168897ecacdSJohn Baldwin PROC_UNLOCK(p); 21692fe6e4d7SDavid Greenman continue; 21702fe6e4d7SDavid Greenman } 21712fe6e4d7SDavid Greenman /* 21722fe6e4d7SDavid Greenman * if the process is in a non-running type state, 21732fe6e4d7SDavid Greenman * don't touch it. 21742fe6e4d7SDavid Greenman */ 2175e602ba25SJulian Elischer breakout = 0; 2176e602ba25SJulian Elischer FOREACH_THREAD_IN_PROC(p, td) { 2177982d11f8SJeff Roberson thread_lock(td); 217871fad9fdSJulian Elischer if (!TD_ON_RUNQ(td) && 217971fad9fdSJulian Elischer !TD_IS_RUNNING(td) && 2180f497cda2SEdward Tomasz Napierala !TD_IS_SLEEPING(td) && 2181f497cda2SEdward Tomasz Napierala !TD_IS_SUSPENDED(td)) { 2182982d11f8SJeff Roberson thread_unlock(td); 2183e602ba25SJulian Elischer breakout = 1; 2184e602ba25SJulian Elischer break; 2185e602ba25SJulian Elischer } 2186982d11f8SJeff Roberson thread_unlock(td); 2187e602ba25SJulian Elischer } 2188897ecacdSJohn Baldwin if (breakout) { 2189897ecacdSJohn Baldwin PROC_UNLOCK(p); 21902fe6e4d7SDavid Greenman continue; 21912fe6e4d7SDavid Greenman } 21922fe6e4d7SDavid Greenman /* 21932fe6e4d7SDavid Greenman * get a limit 21942fe6e4d7SDavid Greenman */ 2195f6f6d240SMateusz Guzik lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); 2196fe2144fdSLuoqi Chen limit = OFF_TO_IDX( 219791d5354aSJohn Baldwin qmin(rsslim.rlim_cur, rsslim.rlim_max)); 21982fe6e4d7SDavid Greenman 21992fe6e4d7SDavid Greenman /* 22000d94caffSDavid Greenman * let processes that are swapped out really be 22010d94caffSDavid Greenman * swapped out set the limit to nothing (will force a 22020d94caffSDavid Greenman * swap-out.) 
22032fe6e4d7SDavid Greenman */ 2204b61ce5b0SJeff Roberson if ((p->p_flag & P_INMEM) == 0) 22050d94caffSDavid Greenman limit = 0; /* XXX */ 22066bed074cSKonstantin Belousov vm = vmspace_acquire_ref(p); 220795e2409aSKonstantin Belousov _PHOLD_LITE(p); 2208897ecacdSJohn Baldwin PROC_UNLOCK(p); 220995e2409aSKonstantin Belousov if (vm == NULL) { 221095e2409aSKonstantin Belousov PRELE(p); 22116bed074cSKonstantin Belousov continue; 221295e2409aSKonstantin Belousov } 221395e2409aSKonstantin Belousov sx_sunlock(&allproc_lock); 22142fe6e4d7SDavid Greenman 22156bed074cSKonstantin Belousov size = vmspace_resident_count(vm); 2216a406d8c3SEdward Tomasz Napierala if (size >= limit) { 2217fe2144fdSLuoqi Chen vm_pageout_map_deactivate_pages( 22186bed074cSKonstantin Belousov &vm->vm_map, limit); 22192fe6e4d7SDavid Greenman } 2220afcc55f3SEdward Tomasz Napierala #ifdef RACCT 22214b5c9cf6SEdward Tomasz Napierala if (racct_enable) { 2222099e7e95SEdward Tomasz Napierala rsize = IDX_TO_OFF(size); 2223099e7e95SEdward Tomasz Napierala PROC_LOCK(p); 2224099e7e95SEdward Tomasz Napierala racct_set(p, RACCT_RSS, rsize); 2225099e7e95SEdward Tomasz Napierala ravailable = racct_get_available(p, RACCT_RSS); 2226099e7e95SEdward Tomasz Napierala PROC_UNLOCK(p); 2227099e7e95SEdward Tomasz Napierala if (rsize > ravailable) { 2228099e7e95SEdward Tomasz Napierala /* 22294b5c9cf6SEdward Tomasz Napierala * Don't be overly aggressive; this 22304b5c9cf6SEdward Tomasz Napierala * might be an innocent process, 22314b5c9cf6SEdward Tomasz Napierala * and the limit could've been exceeded 22324b5c9cf6SEdward Tomasz Napierala * by some memory hog. Don't try 22334b5c9cf6SEdward Tomasz Napierala * to deactivate more than 1/4th 22344b5c9cf6SEdward Tomasz Napierala * of process' resident set size. 2235099e7e95SEdward Tomasz Napierala */ 2236099e7e95SEdward Tomasz Napierala if (attempts <= 8) { 22374b5c9cf6SEdward Tomasz Napierala if (ravailable < rsize - 22384b5c9cf6SEdward Tomasz Napierala (rsize / 4)) { 22394b5c9cf6SEdward Tomasz Napierala ravailable = rsize - 22404b5c9cf6SEdward Tomasz Napierala (rsize / 4); 22414b5c9cf6SEdward Tomasz Napierala } 2242099e7e95SEdward Tomasz Napierala } 2243099e7e95SEdward Tomasz Napierala vm_pageout_map_deactivate_pages( 22444b5c9cf6SEdward Tomasz Napierala &vm->vm_map, 22454b5c9cf6SEdward Tomasz Napierala OFF_TO_IDX(ravailable)); 2246099e7e95SEdward Tomasz Napierala /* Update RSS usage after paging out. */ 2247099e7e95SEdward Tomasz Napierala size = vmspace_resident_count(vm); 2248099e7e95SEdward Tomasz Napierala rsize = IDX_TO_OFF(size); 2249099e7e95SEdward Tomasz Napierala PROC_LOCK(p); 2250099e7e95SEdward Tomasz Napierala racct_set(p, RACCT_RSS, rsize); 2251099e7e95SEdward Tomasz Napierala PROC_UNLOCK(p); 2252099e7e95SEdward Tomasz Napierala if (rsize > ravailable) 2253099e7e95SEdward Tomasz Napierala tryagain = 1; 2254099e7e95SEdward Tomasz Napierala } 22554b5c9cf6SEdward Tomasz Napierala } 2256afcc55f3SEdward Tomasz Napierala #endif 22576bed074cSKonstantin Belousov vmspace_free(vm); 225895e2409aSKonstantin Belousov sx_slock(&allproc_lock); 225995e2409aSKonstantin Belousov PRELE(p); 22602fe6e4d7SDavid Greenman } 22611005a129SJohn Baldwin sx_sunlock(&allproc_lock); 2262099e7e95SEdward Tomasz Napierala if (tryagain != 0 && attempts <= 10) 2263099e7e95SEdward Tomasz Napierala goto again; 226424a1cce3SDavid Greenman } 22652fe6e4d7SDavid Greenman } 2266a1287949SEivind Eklund #endif /* !defined(NO_SWAPPING) */ 2267
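
/*
 * Informal note on the RACCT RSS enforcement in vm_daemon() above (the
 * code is authoritative): when a process's resident set exceeds its
 * RACCT_RSS limit, vm_daemon deactivates pages to push it back toward
 * the limit, but during the first eight attempts it never takes more
 * than a quarter of the current resident set in one pass:
 *
 *	ravailable = max(ravailable, rsize - rsize / 4);
 *	vm_pageout_map_deactivate_pages(&vm->vm_map, OFF_TO_IDX(ravailable));
 *
 * and it retries, up to ten passes in total, while the usage still
 * exceeds the limit.
 */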