sys/vm/vm_pageout.c

60727d8bSWarner Losh/*-
26f9a767SRodney W. Grimes * Copyright (c) 1991 Regents of the University of California.
26f9a767SRodney W. Grimes * All rights reserved.
26f9a767SRodney W. Grimes * Copyright (c) 1994 John S. Dyson
26f9a767SRodney W. Grimes * All rights reserved.
26f9a767SRodney W. Grimes * Copyright (c) 1994 David Greenman
26f9a767SRodney W. Grimes * All rights reserved.
8dbca793STor Egge * Copyright (c) 2005 Yahoo! Technologies Norway AS
8dbca793STor Egge * All rights reserved.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * This code is derived from software contributed to Berkeley by
df8bae1dSRodney W. Grimes * The Mach Operating System project at Carnegie-Mellon University.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Redistribution and use in source and binary forms, with or without
df8bae1dSRodney W. Grimes * modification, are permitted provided that the following conditions
df8bae1dSRodney W. Grimes * are met:
df8bae1dSRodney W. Grimes * 1. Redistributions of source code must retain the above copyright
df8bae1dSRodney W. Grimes *    notice, this list of conditions and the following disclaimer.
df8bae1dSRodney W. Grimes * 2. Redistributions in binary form must reproduce the above copyright
df8bae1dSRodney W. Grimes *    notice, this list of conditions and the following disclaimer in the
df8bae1dSRodney W. Grimes *    documentation and/or other materials provided with the distribution.
df8bae1dSRodney W. Grimes * 3. All advertising materials mentioning features or use of this software
5929bcfaSPhilippe Charnier *    must display the following acknowledgement:
df8bae1dSRodney W. Grimes *	This product includes software developed by the University of
df8bae1dSRodney W. Grimes *	California, Berkeley and its contributors.
df8bae1dSRodney W. Grimes * 4. Neither the name of the University nor the names of its contributors
df8bae1dSRodney W. Grimes *    may be used to endorse or promote products derived from this software
df8bae1dSRodney W. Grimes *    without specific prior written permission.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
df8bae1dSRodney W. Grimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
df8bae1dSRodney W. Grimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
df8bae1dSRodney W. Grimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
df8bae1dSRodney W. Grimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
df8bae1dSRodney W. Grimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
df8bae1dSRodney W. Grimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
df8bae1dSRodney W. Grimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
df8bae1dSRodney W. Grimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
df8bae1dSRodney W. Grimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
df8bae1dSRodney W. Grimes * SUCH DAMAGE.
df8bae1dSRodney W. Grimes *
3c4dd356SDavid Greenman *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Copyright (c) 1987, 1990 Carnegie-Mellon University.
df8bae1dSRodney W. Grimes * All rights reserved.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Authors: Avadis Tevanian, Jr., Michael Wayne Young
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Permission to use, copy, modify and distribute this software and
df8bae1dSRodney W. Grimes * its documentation is hereby granted, provided that both the copyright
df8bae1dSRodney W. Grimes * notice and this permission notice appear in all copies of the
df8bae1dSRodney W. Grimes * software, derivative works or modified versions, and any portions
df8bae1dSRodney W. Grimes * thereof, and that both notices appear in supporting documentation.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
df8bae1dSRodney W. Grimes * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
df8bae1dSRodney W. Grimes * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Carnegie Mellon requests users of this software to return to
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
df8bae1dSRodney W. Grimes *  School of Computer Science
df8bae1dSRodney W. Grimes *  Carnegie Mellon University
df8bae1dSRodney W. Grimes *  Pittsburgh PA 15213-3890
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * any improvements or extensions that they make and grant Carnegie the
df8bae1dSRodney W. Grimes * rights to redistribute these changes.
df8bae1dSRodney W. Grimes */
df8bae1dSRodney W. Grimes
df8bae1dSRodney W. Grimes/*
df8bae1dSRodney W. Grimes *	The proverbial page-out daemon.
df8bae1dSRodney W. Grimes */
df8bae1dSRodney W. Grimes
874651b1SDavid E. O'Brien#include <sys/cdefs.h>
874651b1SDavid E. O'Brien__FBSDID("$FreeBSD$");
874651b1SDavid E. O'Brien
faa5f8d8SAndrzej Bialecki#include "opt_vm.h"
7672ca05SMark Johnston
df8bae1dSRodney W. Grimes#include <sys/param.h>
26f9a767SRodney W. Grimes#include <sys/systm.h>
b5e8ce9fSBruce Evans#include <sys/kernel.h>
855a310fSJeff Roberson#include <sys/eventhandler.h>
fb919e4dSMark Murray#include <sys/lock.h>
fb919e4dSMark Murray#include <sys/mutex.h>
26f9a767SRodney W. Grimes#include <sys/proc.h>
9c8b8baaSPeter Wemm#include <sys/kthread.h>
0384fff8SJason Evans#include <sys/ktr.h>
97824da3SAlan Cox#include <sys/mount.h>
099e7e95SEdward Tomasz Napierala#include <sys/racct.h>
26f9a767SRodney W. Grimes#include <sys/resourcevar.h>
b43179fbSJeff Roberson#include <sys/sched.h>
14a0d74eSSteven Hartland#include <sys/sdt.h>
d2fc5315SPoul-Henning Kamp#include <sys/signalvar.h>
449c2e92SKonstantin Belousov#include <sys/smp.h>
a6bf3a9eSRyan Stone#include <sys/time.h>
f6b04d2bSDavid Greenman#include <sys/vnode.h>
efeaf95aSDavid Greenman#include <sys/vmmeter.h>
89f6b863SAttilio Rao#include <sys/rwlock.h>
1005a129SJohn Baldwin#include <sys/sx.h>
38efa82bSJohn Dyson#include <sys/sysctl.h>
df8bae1dSRodney W. Grimes
df8bae1dSRodney W. Grimes#include <vm/vm.h>
efeaf95aSDavid Greenman#include <vm/vm_param.h>
efeaf95aSDavid Greenman#include <vm/vm_object.h>
df8bae1dSRodney W. Grimes#include <vm/vm_page.h>
efeaf95aSDavid Greenman#include <vm/vm_map.h>
df8bae1dSRodney W. Grimes#include <vm/vm_pageout.h>
24a1cce3SDavid Greenman#include <vm/vm_pager.h>
449c2e92SKonstantin Belousov#include <vm/vm_phys.h>
05f0fdd2SPoul-Henning Kamp#include <vm/swap_pager.h>
efeaf95aSDavid Greenman#include <vm/vm_extern.h>
670d17b5SJeff Roberson#include <vm/uma.h>
df8bae1dSRodney W. Grimes
2b14f991SJulian Elischer/*
2b14f991SJulian Elischer * System initialization
2b14f991SJulian Elischer */
2b14f991SJulian Elischer
2b14f991SJulian Elischer/* the kernel process "vm_pageout"*/
11caded3SAlfred Perlsteinstatic void vm_pageout(void);
4d19f4adSSteven Hartlandstatic void vm_pageout_init(void);
ebcddc72SAlan Coxstatic int vm_pageout_clean(vm_page_t m, int *numpagedout);
34d8b7eaSJeff Robersonstatic int vm_pageout_cluster(vm_page_t m);
e57dd910SAlan Coxstatic bool vm_pageout_scan(struct vm_domain *vmd, int pass);
76386c7eSKonstantin Belousovstatic void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
76386c7eSKonstantin Belousov    int starting_page_shortage);
45ae1d91SAlan Cox
4d19f4adSSteven HartlandSYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
4d19f4adSSteven Hartland    NULL);
4d19f4adSSteven Hartland
2b14f991SJulian Elischerstruct proc *pageproc;
2b14f991SJulian Elischer
2b14f991SJulian Elischerstatic struct kproc_desc page_kp = {
2b14f991SJulian Elischer	"pagedaemon",
2b14f991SJulian Elischer	vm_pageout,
2b14f991SJulian Elischer	&pageproc
2b14f991SJulian Elischer};
4d19f4adSSteven HartlandSYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
237fdd78SRobert Watson    &page_kp);
2b14f991SJulian Elischer
14a0d74eSSteven HartlandSDT_PROVIDER_DEFINE(vm);
14a0d74eSSteven HartlandSDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
14a0d74eSSteven Hartland
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
2b14f991SJulian Elischer/* the kernel process "vm_daemon"*/
11caded3SAlfred Perlsteinstatic void vm_daemon(void);
f708ef1bSPoul-Henning Kampstatic struct	proc *vmproc;
2b14f991SJulian Elischer
2b14f991SJulian Elischerstatic struct kproc_desc vm_kp = {
2b14f991SJulian Elischer	"vmdaemon",
2b14f991SJulian Elischer	vm_daemon,
2b14f991SJulian Elischer	&vmproc
2b14f991SJulian Elischer};
237fdd78SRobert WatsonSYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
38efa82bSJohn Dyson#endif
2b14f991SJulian Elischer
ebcddc72SAlan Cox/* Pagedaemon activity rates, in subdivisions of one second. */
ebcddc72SAlan Cox#define	VM_LAUNDER_RATE		10
ebcddc72SAlan Cox#define	VM_INACT_SCAN_RATE	2
2b14f991SJulian Elischer
8b245767SAlan Coxint vm_pageout_deficit;		/* Estimated number of pages deficit */
20c58db9SMark Johnstonu_int vm_pageout_wakeup_thresh;
76386c7eSKonstantin Belousovstatic int vm_pageout_oom_seq = 12;
56ce0690SAlan Coxbool vm_pageout_wanted;		/* Event on which pageout daemon sleeps */
56ce0690SAlan Coxbool vm_pages_needed;		/* Are threads waiting for free pages? */
26f9a767SRodney W. Grimes
ebcddc72SAlan Cox/* Pending request for dirty page laundering. */
ebcddc72SAlan Coxstatic enum {
ebcddc72SAlan Cox	VM_LAUNDRY_IDLE,
ebcddc72SAlan Cox	VM_LAUNDRY_BACKGROUND,
ebcddc72SAlan Cox	VM_LAUNDRY_SHORTFALL
ebcddc72SAlan Cox} vm_laundry_request = VM_LAUNDRY_IDLE;
ebcddc72SAlan Cox
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
f708ef1bSPoul-Henning Kampstatic int vm_pageout_req_swapout;	/* XXX */
f708ef1bSPoul-Henning Kampstatic int vm_daemon_needed;
97824da3SAlan Coxstatic struct mtx vm_daemon_mtx;
97824da3SAlan Cox/* Allow for use by vm_pageout before vm_daemon is initialized. */
97824da3SAlan CoxMTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
38efa82bSJohn Dyson#endif
d9e23210SJeff Robersonstatic int vm_pageout_update_period;
4a365329SAndrey Zonovstatic int disable_swap_pageouts;
c9612b2dSJeff Robersonstatic int lowmem_period = 10;
a6bf3a9eSRyan Stonestatic time_t lowmem_uptime;
b1fd102eSMark Johnstonstatic int swapdev_enabled;
70111b90SJohn Dyson
38efa82bSJohn Dyson#if defined(NO_SWAPPING)
303b270bSEivind Eklundstatic int vm_swap_enabled = 0;
303b270bSEivind Eklundstatic int vm_swap_idle_enabled = 0;
38efa82bSJohn Dyson#else
303b270bSEivind Eklundstatic int vm_swap_enabled = 1;
303b270bSEivind Eklundstatic int vm_swap_idle_enabled = 0;
38efa82bSJohn Dyson#endif
38efa82bSJohn Dyson
8311a2b8SWill Andrewsstatic int vm_panic_on_oom = 0;
8311a2b8SWill Andrews
8311a2b8SWill AndrewsSYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
8311a2b8SWill Andrews	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
8311a2b8SWill Andrews	"panic on out of memory instead of killing the largest process");
8311a2b8SWill Andrews
d9e23210SJeff RobersonSYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
d9e23210SJeff Roberson	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
d9e23210SJeff Roberson	"free page threshold for waking up the pageout daemon");
d9e23210SJeff Roberson
d9e23210SJeff RobersonSYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
d9e23210SJeff Roberson	CTLFLAG_RW, &vm_pageout_update_period, 0,
d9e23210SJeff Roberson	"Maximum active LRU update period");
53636869SAndrey Zonov
c9612b2dSJeff RobersonSYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
c9612b2dSJeff Roberson	"Low memory callback period");
c9612b2dSJeff Roberson
38efa82bSJohn Dyson#if defined(NO_SWAPPING)
ceb0cf87SJohn DysonSYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
6bd9cb1cSTom Rhodes	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
6bd9cb1cSTom Rhodes	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
38efa82bSJohn Dyson#else
ceb0cf87SJohn DysonSYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
38efa82bSJohn Dyson#endif
26f9a767SRodney W. Grimes
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
b0359e2cSPeter Wemm	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
12ac6a1dSJohn Dyson
23b59018SMatthew Dillonstatic int pageout_lock_miss;
23b59018SMatthew DillonSYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
23b59018SMatthew Dillon	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
23b59018SMatthew Dillon
76386c7eSKonstantin BelousovSYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
76386c7eSKonstantin Belousov	CTLFLAG_RW, &vm_pageout_oom_seq, 0,
76386c7eSKonstantin Belousov	"back-to-back calls to oom detector to start OOM");
76386c7eSKonstantin Belousov
ebcddc72SAlan Coxstatic int act_scan_laundry_weight = 3;
ebcddc72SAlan CoxSYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW,
ebcddc72SAlan Cox    &act_scan_laundry_weight, 0,
ebcddc72SAlan Cox    "weight given to clean vs. dirty pages in active queue scans");
ebcddc72SAlan Cox
ebcddc72SAlan Coxstatic u_int vm_background_launder_target;
ebcddc72SAlan CoxSYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW,
ebcddc72SAlan Cox    &vm_background_launder_target, 0,
ebcddc72SAlan Cox    "background laundering target, in pages");
ebcddc72SAlan Cox
ebcddc72SAlan Coxstatic u_int vm_background_launder_rate = 4096;
ebcddc72SAlan CoxSYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW,
ebcddc72SAlan Cox    &vm_background_launder_rate, 0,
ebcddc72SAlan Cox    "background laundering rate, in kilobytes per second");
ebcddc72SAlan Cox
ebcddc72SAlan Coxstatic u_int vm_background_launder_max = 20 * 1024;
ebcddc72SAlan CoxSYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW,
ebcddc72SAlan Cox    &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
ebcddc72SAlan Cox
ffc82b0aSJohn Dyson#define VM_PAGEOUT_PAGE_COUNT 16
bbc0ec52SDavid Greenmanint vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
df8bae1dSRodney W. Grimes
c3cb3e12SDavid Greenmanint vm_page_max_wired;		/* XXX max # of wired pages system-wide */
5dfc2870SAlan CoxSYSCTL_INT(_vm, OID_AUTO, max_wired,
5dfc2870SAlan Cox	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
df8bae1dSRodney W. Grimes
ebcddc72SAlan Coxstatic u_int isqrt(u_int num);
85eeca35SAlan Coxstatic boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
ebcddc72SAlan Coxstatic int vm_pageout_launder(struct vm_domain *vmd, int launder,
ebcddc72SAlan Cox    bool in_shortfall);
ebcddc72SAlan Coxstatic void vm_pageout_laundry_worker(void *arg);
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
ecf6279fSAlan Coxstatic void vm_pageout_map_deactivate_pages(vm_map_t, long);
ecf6279fSAlan Coxstatic void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
97824da3SAlan Coxstatic void vm_req_vmdaemon(int req);
38efa82bSJohn Dyson#endif
85eeca35SAlan Coxstatic boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
cd41fc12SDavid Greenman
a8229fa3SAlan Cox/*
a8229fa3SAlan Cox * Initialize a dummy page for marking the caller's place in the specified
a8229fa3SAlan Cox * paging queue.  In principle, this function only needs to set the flag
f0edf3f8SAlan Cox * PG_MARKER.  Nonetheless, it write busies and initializes the hold count
c7aebda8SAttilio Rao * to one as safety precautions.
a8229fa3SAlan Cox */
8c616246SKonstantin Belousovstatic void
8c616246SKonstantin Belousovvm_pageout_init_marker(vm_page_t marker, u_short queue)
8c616246SKonstantin Belousov{
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	bzero(marker, sizeof(*marker));
a8229fa3SAlan Cox	marker->flags = PG_MARKER;
c7aebda8SAttilio Rao	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
8c616246SKonstantin Belousov	marker->queue = queue;
a8229fa3SAlan Cox	marker->hold_count = 1;
8c616246SKonstantin Belousov}
8c616246SKonstantin Belousov
26f9a767SRodney W. Grimes/*
8dbca793STor Egge * vm_pageout_fallback_object_lock:
8dbca793STor Egge *
89f6b863SAttilio Rao * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
8dbca793STor Egge * known to have failed and page queue must be either PQ_ACTIVE or
44be0a8eSMark Johnston * PQ_INACTIVE.  To avoid lock order violation, unlock the page queue
8dbca793STor Egge * while locking the vm object.  Use marker page to detect page queue
8dbca793STor Egge * changes and maintain notion of next page on page queue.  Return
8dbca793STor Egge * TRUE if no changes were detected, FALSE otherwise.  vm object is
8dbca793STor Egge * locked on return.
8dbca793STor Egge *
8dbca793STor Egge * This function depends on both the lock portion of struct vm_object
8dbca793STor Egge * and normal struct vm_page being type stable.
8dbca793STor Egge */
85eeca35SAlan Coxstatic boolean_t
8dbca793STor Eggevm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
8dbca793STor Egge{
8dbca793STor Egge	struct vm_page marker;
8d220203SAlan Cox	struct vm_pagequeue *pq;
8dbca793STor Egge	boolean_t unchanged;
8dbca793STor Egge	u_short queue;
8dbca793STor Egge	vm_object_t object;
8dbca793STor Egge
8dbca793STor Egge	queue = m->queue;
8c616246SKonstantin Belousov	vm_pageout_init_marker(&marker, queue);
449c2e92SKonstantin Belousov	pq = vm_page_pagequeue(m);
8dbca793STor Egge	object = m->object;
8dbca793STor Egge
c325e866SKonstantin Belousov	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
2965a453SKip Macy	vm_page_unlock(m);
89f6b863SAttilio Rao	VM_OBJECT_WLOCK(object);
2965a453SKip Macy	vm_page_lock(m);
8d220203SAlan Cox	vm_pagequeue_lock(pq);
8dbca793STor Egge
69b8585eSKonstantin Belousov	/*
69b8585eSKonstantin Belousov	 * The page's object might have changed, and/or the page might
69b8585eSKonstantin Belousov	 * have moved from its original position in the queue.  If the
69b8585eSKonstantin Belousov	 * page's object has changed, then the caller should abandon
69b8585eSKonstantin Belousov	 * processing the page because the wrong object lock was
69b8585eSKonstantin Belousov	 * acquired.  Use the marker's plinks.q, not the page's, to
69b8585eSKonstantin Belousov	 * determine if the page has been moved.  The state of the
69b8585eSKonstantin Belousov	 * page's plinks.q can be indeterminate; whereas, the marker's
69b8585eSKonstantin Belousov	 * plinks.q must be valid.
69b8585eSKonstantin Belousov	 */
c325e866SKonstantin Belousov	*next = TAILQ_NEXT(&marker, plinks.q);
69b8585eSKonstantin Belousov	unchanged = m->object == object &&
69b8585eSKonstantin Belousov	    m == TAILQ_PREV(&marker, pglist, plinks.q);
69b8585eSKonstantin Belousov	KASSERT(!unchanged || m->queue == queue,
69b8585eSKonstantin Belousov	    ("page %p queue %d %d", m, queue, m->queue));
c325e866SKonstantin Belousov	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
8dbca793STor Egge	return (unchanged);
8dbca793STor Egge}
8dbca793STor Egge
8dbca793STor Egge/*
8c616246SKonstantin Belousov * Lock the page while holding the page queue lock.  Use marker page
8c616246SKonstantin Belousov * to detect page queue changes and maintain notion of next page on
8c616246SKonstantin Belousov * page queue.  Return TRUE if no changes were detected, FALSE
8c616246SKonstantin Belousov * otherwise.  The page is locked on return. The page queue lock might
8c616246SKonstantin Belousov * be dropped and reacquired.
8c616246SKonstantin Belousov *
8c616246SKonstantin Belousov * This function depends on normal struct vm_page being type stable.
8c616246SKonstantin Belousov */
85eeca35SAlan Coxstatic boolean_t
8c616246SKonstantin Belousovvm_pageout_page_lock(vm_page_t m, vm_page_t *next)
8c616246SKonstantin Belousov{
8c616246SKonstantin Belousov	struct vm_page marker;
8d220203SAlan Cox	struct vm_pagequeue *pq;
8c616246SKonstantin Belousov	boolean_t unchanged;
8c616246SKonstantin Belousov	u_short queue;
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	vm_page_lock_assert(m, MA_NOTOWNED);
8c616246SKonstantin Belousov	if (vm_page_trylock(m))
8c616246SKonstantin Belousov		return (TRUE);
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	queue = m->queue;
8c616246SKonstantin Belousov	vm_pageout_init_marker(&marker, queue);
449c2e92SKonstantin Belousov	pq = vm_page_pagequeue(m);
8c616246SKonstantin Belousov
c325e866SKonstantin Belousov	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
8c616246SKonstantin Belousov	vm_page_lock(m);
8d220203SAlan Cox	vm_pagequeue_lock(pq);
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	/* Page queue might have changed. */
c325e866SKonstantin Belousov	*next = TAILQ_NEXT(&marker, plinks.q);
69b8585eSKonstantin Belousov	unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
69b8585eSKonstantin Belousov	KASSERT(!unchanged || m->queue == queue,
69b8585eSKonstantin Belousov	    ("page %p queue %d %d", m, queue, m->queue));
c325e866SKonstantin Belousov	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
8c616246SKonstantin Belousov	return (unchanged);
8c616246SKonstantin Belousov}
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov/*
248fe642SAlan Cox * Scan for pages at adjacent offsets within the given page's object that are
248fe642SAlan Cox * eligible for laundering, form a cluster of these pages and the given page,
248fe642SAlan Cox * and launder that cluster.
26f9a767SRodney W. Grimes */
3af76890SPoul-Henning Kampstatic int
34d8b7eaSJeff Robersonvm_pageout_cluster(vm_page_t m)
24a1cce3SDavid Greenman{
54d92145SMatthew Dillon	vm_object_t object;
248fe642SAlan Cox	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
248fe642SAlan Cox	vm_pindex_t pindex;
248fe642SAlan Cox	int ib, is, page_base, pageout_count;
26f9a767SRodney W. Grimes
248fe642SAlan Cox	vm_page_assert_locked(m);
17f6a17bSAlan Cox	object = m->object;
89f6b863SAttilio Rao	VM_OBJECT_ASSERT_WLOCKED(object);
248fe642SAlan Cox	pindex = m->pindex;
0cddd8f0SMatthew Dillon
26f9a767SRodney W. Grimes	/*
248fe642SAlan Cox	 * We can't clean the page if it is busy or held.
24a1cce3SDavid Greenman	 */
c7aebda8SAttilio Rao	vm_page_assert_unbusied(m);
248fe642SAlan Cox	KASSERT(m->hold_count == 0, ("page %p is held", m));
17f6a17bSAlan Cox	vm_page_unlock(m);
0d94caffSDavid Greenman
91b4f427SAlan Cox	mc[vm_pageout_page_count] = pb = ps = m;
26f9a767SRodney W. Grimes	pageout_count = 1;
f35329acSJohn Dyson	page_base = vm_pageout_page_count;
90ecac61SMatthew Dillon	ib = 1;
90ecac61SMatthew Dillon	is = 1;
90ecac61SMatthew Dillon
24a1cce3SDavid Greenman	/*
248fe642SAlan Cox	 * We can cluster only if the page is not clean, busy, or held, and
ebcddc72SAlan Cox	 * the page is in the laundry queue.
90ecac61SMatthew Dillon	 *
90ecac61SMatthew Dillon	 * During heavy mmap/modification loads the pageout
90ecac61SMatthew Dillon	 * daemon can really fragment the underlying file
248fe642SAlan Cox	 * due to flushing pages out of order and not trying to
248fe642SAlan Cox	 * align the clusters (which leaves sporadic out-of-order
90ecac61SMatthew Dillon	 * holes).  To solve this problem we do the reverse scan
90ecac61SMatthew Dillon	 * first and attempt to align our cluster, then do a
90ecac61SMatthew Dillon	 * forward scan if room remains.
24a1cce3SDavid Greenman	 */
90ecac61SMatthew Dillonmore:
248fe642SAlan Cox	while (ib != 0 && pageout_count < vm_pageout_page_count) {
90ecac61SMatthew Dillon		if (ib > pindex) {
90ecac61SMatthew Dillon			ib = 0;
90ecac61SMatthew Dillon			break;
f6b04d2bSDavid Greenman		}
c7aebda8SAttilio Rao		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
90ecac61SMatthew Dillon			ib = 0;
90ecac61SMatthew Dillon			break;
f6b04d2bSDavid Greenman		}
24a1cce3SDavid Greenman		vm_page_test_dirty(p);
eb5d3969SAlan Cox		if (p->dirty == 0) {
eb5d3969SAlan Cox			ib = 0;
eb5d3969SAlan Cox			break;
eb5d3969SAlan Cox		}
eb5d3969SAlan Cox		vm_page_lock(p);
ebcddc72SAlan Cox		if (!vm_page_in_laundry(p) ||
57601bcbSMatthew Dillon		    p->hold_count != 0) {	/* may be undergoing I/O */
2965a453SKip Macy			vm_page_unlock(p);
90ecac61SMatthew Dillon			ib = 0;
24a1cce3SDavid Greenman			break;
f6b04d2bSDavid Greenman		}
2965a453SKip Macy		vm_page_unlock(p);
91b4f427SAlan Cox		mc[--page_base] = pb = p;
90ecac61SMatthew Dillon		++pageout_count;
90ecac61SMatthew Dillon		++ib;
248fe642SAlan Cox
24a1cce3SDavid Greenman		/*
248fe642SAlan Cox		 * We are at an alignment boundary.  Stop here, and switch
248fe642SAlan Cox		 * directions.  Do not clear ib.
24a1cce3SDavid Greenman		 */
90ecac61SMatthew Dillon		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
90ecac61SMatthew Dillon			break;
24a1cce3SDavid Greenman	}
90ecac61SMatthew Dillon	while (pageout_count < vm_pageout_page_count &&
90ecac61SMatthew Dillon	    pindex + is < object->size) {
c7aebda8SAttilio Rao		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
90ecac61SMatthew Dillon			break;
24a1cce3SDavid Greenman		vm_page_test_dirty(p);
eb5d3969SAlan Cox		if (p->dirty == 0)
eb5d3969SAlan Cox			break;
eb5d3969SAlan Cox		vm_page_lock(p);
ebcddc72SAlan Cox		if (!vm_page_in_laundry(p) ||
57601bcbSMatthew Dillon		    p->hold_count != 0) {	/* may be undergoing I/O */
2965a453SKip Macy			vm_page_unlock(p);
24a1cce3SDavid Greenman			break;
24a1cce3SDavid Greenman		}
2965a453SKip Macy		vm_page_unlock(p);
91b4f427SAlan Cox		mc[page_base + pageout_count] = ps = p;
90ecac61SMatthew Dillon		++pageout_count;
90ecac61SMatthew Dillon		++is;
24a1cce3SDavid Greenman	}
90ecac61SMatthew Dillon
90ecac61SMatthew Dillon	/*
90ecac61SMatthew Dillon	 * If we exhausted our forward scan, continue with the reverse scan
248fe642SAlan Cox	 * when possible, even past an alignment boundary.  This catches
248fe642SAlan Cox	 * boundary conditions.
90ecac61SMatthew Dillon	 */
248fe642SAlan Cox	if (ib != 0 && pageout_count < vm_pageout_page_count)
90ecac61SMatthew Dillon		goto more;
f6b04d2bSDavid Greenman
99e6e193SMark Johnston	return (vm_pageout_flush(&mc[page_base], pageout_count,
99e6e193SMark Johnston	    VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
aef922f5SJohn Dyson}
aef922f5SJohn Dyson
1c7c3c6aSMatthew Dillon/*
1c7c3c6aSMatthew Dillon * vm_pageout_flush() - launder the given pages
1c7c3c6aSMatthew Dillon *
1c7c3c6aSMatthew Dillon *	The given pages are laundered.  Note that we setup for the start of
1c7c3c6aSMatthew Dillon *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
1c7c3c6aSMatthew Dillon *	reference count all in here rather then in the parent.  If we want
1c7c3c6aSMatthew Dillon *	the parent to do more sophisticated things we may have to change
1c7c3c6aSMatthew Dillon *	the ordering.
1e8a675cSKonstantin Belousov *
1e8a675cSKonstantin Belousov *	Returned runlen is the count of pages between mreq and first
1e8a675cSKonstantin Belousov *	page after mreq with status VM_PAGER_AGAIN.
126d6082SKonstantin Belousov *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
126d6082SKonstantin Belousov *	for any page in runlen set.
1c7c3c6aSMatthew Dillon */
aef922f5SJohn Dysonint
126d6082SKonstantin Belousovvm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
126d6082SKonstantin Belousov    boolean_t *eio)
aef922f5SJohn Dyson{
2e3b314dSAlan Cox	vm_object_t object = mc[0]->object;
aef922f5SJohn Dyson	int pageout_status[count];
95461b45SJohn Dyson	int numpagedout = 0;
1e8a675cSKonstantin Belousov	int i, runlen;
aef922f5SJohn Dyson
89f6b863SAttilio Rao	VM_OBJECT_ASSERT_WLOCKED(object);
7bec141bSKip Macy
1c7c3c6aSMatthew Dillon	/*
1c7c3c6aSMatthew Dillon	 * Initiate I/O.  Bump the vm_page_t->busy counter and
1c7c3c6aSMatthew Dillon	 * mark the pages read-only.
1c7c3c6aSMatthew Dillon	 *
1c7c3c6aSMatthew Dillon	 * We do not have to fixup the clean/dirty bits here... we can
1c7c3c6aSMatthew Dillon	 * allow the pager to do it after the I/O completes.
02fa91d3SMatthew Dillon	 *
02fa91d3SMatthew Dillon	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
02fa91d3SMatthew Dillon	 * edge case with file fragments.
1c7c3c6aSMatthew Dillon	 */
8f9110f6SJohn Dyson	for (i = 0; i < count; i++) {
7a935082SAlan Cox		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
7a935082SAlan Cox		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
7a935082SAlan Cox			mc[i], i, count));
c7aebda8SAttilio Rao		vm_page_sbusy(mc[i]);
78985e42SAlan Cox		pmap_remove_write(mc[i]);
2965a453SKip Macy	}
d474eaaaSDoug Rabson	vm_object_pip_add(object, count);
aef922f5SJohn Dyson
d076fbeaSAlan Cox	vm_pager_put_pages(object, mc, count, flags, pageout_status);
26f9a767SRodney W. Grimes
1e8a675cSKonstantin Belousov	runlen = count - mreq;
126d6082SKonstantin Belousov	if (eio != NULL)
126d6082SKonstantin Belousov		*eio = FALSE;
aef922f5SJohn Dyson	for (i = 0; i < count; i++) {
aef922f5SJohn Dyson		vm_page_t mt = mc[i];
24a1cce3SDavid Greenman
4cd45723SAlan Cox		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
6031c68dSAlan Cox		    !pmap_page_is_write_mapped(mt),
9ea8d1a6SAlan Cox		    ("vm_pageout_flush: page %p is not write protected", mt));
26f9a767SRodney W. Grimes		switch (pageout_status[i]) {
26f9a767SRodney W. Grimes		case VM_PAGER_OK:
ebcddc72SAlan Cox			vm_page_lock(mt);
ebcddc72SAlan Cox			if (vm_page_in_laundry(mt))
ebcddc72SAlan Cox				vm_page_deactivate_noreuse(mt);
ebcddc72SAlan Cox			vm_page_unlock(mt);
ebcddc72SAlan Cox			/* FALLTHROUGH */
26f9a767SRodney W. Grimes		case VM_PAGER_PEND:
95461b45SJohn Dyson			numpagedout++;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_BAD:
26f9a767SRodney W. Grimes			/*
ebcddc72SAlan Cox			 * The page is outside the object's range.  We pretend
ebcddc72SAlan Cox			 * that the page out worked and clean the page, so the
ebcddc72SAlan Cox			 * changes will be lost if the page is reclaimed by
ebcddc72SAlan Cox			 * the page daemon.
26f9a767SRodney W. Grimes			 */
90ecac61SMatthew Dillon			vm_page_undirty(mt);
ebcddc72SAlan Cox			vm_page_lock(mt);
ebcddc72SAlan Cox			if (vm_page_in_laundry(mt))
ebcddc72SAlan Cox				vm_page_deactivate_noreuse(mt);
ebcddc72SAlan Cox			vm_page_unlock(mt);
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_ERROR:
26f9a767SRodney W. Grimes		case VM_PAGER_FAIL:
26f9a767SRodney W. Grimes			/*
b1fd102eSMark Johnston			 * If the page couldn't be paged out to swap because the
b1fd102eSMark Johnston			 * pager wasn't able to find space, place the page in
b1fd102eSMark Johnston			 * the PQ_UNSWAPPABLE holding queue.  This is an
b1fd102eSMark Johnston			 * optimization that prevents the page daemon from
b1fd102eSMark Johnston			 * wasting CPU cycles on pages that cannot be reclaimed
b1fd102eSMark Johnston			 * becase no swap device is configured.
b1fd102eSMark Johnston			 *
b1fd102eSMark Johnston			 * Otherwise, reactivate the page so that it doesn't
b1fd102eSMark Johnston			 * clog the laundry and inactive queues.  (We will try
b1fd102eSMark Johnston			 * paging it out again later.)
26f9a767SRodney W. Grimes			 */
3c4a2440SAlan Cox			vm_page_lock(mt);
b1fd102eSMark Johnston			if (object->type == OBJT_SWAP &&
b1fd102eSMark Johnston			    pageout_status[i] == VM_PAGER_FAIL) {
b1fd102eSMark Johnston				vm_page_unswappable(mt);
b1fd102eSMark Johnston				numpagedout++;
b1fd102eSMark Johnston			} else
24a1cce3SDavid Greenman				vm_page_activate(mt);
3c4a2440SAlan Cox			vm_page_unlock(mt);
126d6082SKonstantin Belousov			if (eio != NULL && i >= mreq && i - mreq < runlen)
126d6082SKonstantin Belousov				*eio = TRUE;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_AGAIN:
1e8a675cSKonstantin Belousov			if (i >= mreq && i - mreq < runlen)
1e8a675cSKonstantin Belousov				runlen = i - mreq;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		}
26f9a767SRodney W. Grimes
26f9a767SRodney W. Grimes		/*
0d94caffSDavid Greenman		 * If the operation is still going, leave the page busy to
0d94caffSDavid Greenman		 * block all other accesses. Also, leave the paging in
0d94caffSDavid Greenman		 * progress indicator set so that we don't attempt an object
0d94caffSDavid Greenman		 * collapse.
26f9a767SRodney W. Grimes		 */
26f9a767SRodney W. Grimes		if (pageout_status[i] != VM_PAGER_PEND) {
f919ebdeSDavid Greenman			vm_object_pip_wakeup(object);
c7aebda8SAttilio Rao			vm_page_sunbusy(mt);
3c4a2440SAlan Cox		}
3c4a2440SAlan Cox	}
1e8a675cSKonstantin Belousov	if (prunlen != NULL)
1e8a675cSKonstantin Belousov		*prunlen = runlen;
3c4a2440SAlan Cox	return (numpagedout);
26f9a767SRodney W. Grimes}
26f9a767SRodney W. Grimes
b1fd102eSMark Johnstonstatic void
b1fd102eSMark Johnstonvm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
b1fd102eSMark Johnston{
b1fd102eSMark Johnston
b1fd102eSMark Johnston	atomic_store_rel_int(&swapdev_enabled, 1);
b1fd102eSMark Johnston}
b1fd102eSMark Johnston
b1fd102eSMark Johnstonstatic void
b1fd102eSMark Johnstonvm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
b1fd102eSMark Johnston{
b1fd102eSMark Johnston
b1fd102eSMark Johnston	if (swap_pager_nswapdev() == 1)
b1fd102eSMark Johnston		atomic_store_rel_int(&swapdev_enabled, 0);
b1fd102eSMark Johnston}
b1fd102eSMark Johnston
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
26f9a767SRodney W. Grimes/*
26f9a767SRodney W. Grimes *	vm_pageout_object_deactivate_pages
26f9a767SRodney W. Grimes *
ce186587SAlan Cox *	Deactivate enough pages to satisfy the inactive target
ce186587SAlan Cox *	requirements.
26f9a767SRodney W. Grimes *
26f9a767SRodney W. Grimes *	The object and map must be locked.
26f9a767SRodney W. Grimes */
38efa82bSJohn Dysonstatic void
ce186587SAlan Coxvm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
ce186587SAlan Cox    long desired)
26f9a767SRodney W. Grimes{
ecf6279fSAlan Cox	vm_object_t backing_object, object;
ce186587SAlan Cox	vm_page_t p;
bb7858eaSJeff Roberson	int act_delta, remove_mode;
26f9a767SRodney W. Grimes
e23b0a19SAlan Cox	VM_OBJECT_ASSERT_LOCKED(first_object);
28634820SAlan Cox	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
38efa82bSJohn Dyson		return;
ecf6279fSAlan Cox	for (object = first_object;; object = backing_object) {
ecf6279fSAlan Cox		if (pmap_resident_count(pmap) <= desired)
ecf6279fSAlan Cox			goto unlock_return;
e23b0a19SAlan Cox		VM_OBJECT_ASSERT_LOCKED(object);
28634820SAlan Cox		if ((object->flags & OBJ_UNMANAGED) != 0 ||
28634820SAlan Cox		    object->paging_in_progress != 0)
ecf6279fSAlan Cox			goto unlock_return;
26f9a767SRodney W. Grimes
85b1dc89SAlan Cox		remove_mode = 0;
38efa82bSJohn Dyson		if (object->shadow_count > 1)
38efa82bSJohn Dyson			remove_mode = 1;
26f9a767SRodney W. Grimes		/*
ce186587SAlan Cox		 * Scan the object's entire memory queue.
26f9a767SRodney W. Grimes		 */
ce186587SAlan Cox		TAILQ_FOREACH(p, &object->memq, listq) {
447fe2a4SAlan Cox			if (pmap_resident_count(pmap) <= desired)
447fe2a4SAlan Cox				goto unlock_return;
c7aebda8SAttilio Rao			if (vm_page_busied(p))
447fe2a4SAlan Cox				continue;
*83c9dea1SGleb Smirnoff			VM_CNT_INC(v_pdpages);
2965a453SKip Macy			vm_page_lock(p);
ce186587SAlan Cox			if (p->wire_count != 0 || p->hold_count != 0 ||
ecf6279fSAlan Cox			    !pmap_page_exists_quick(pmap, p)) {
2965a453SKip Macy				vm_page_unlock(p);
0d94caffSDavid Greenman				continue;
0d94caffSDavid Greenman			}
bb7858eaSJeff Roberson			act_delta = pmap_ts_referenced(p);
3407fefeSKonstantin Belousov			if ((p->aflags & PGA_REFERENCED) != 0) {
bb7858eaSJeff Roberson				if (act_delta == 0)
bb7858eaSJeff Roberson					act_delta = 1;
3407fefeSKonstantin Belousov				vm_page_aflag_clear(p, PGA_REFERENCED);
ef743ce6SJohn Dyson			}
ebcddc72SAlan Cox			if (!vm_page_active(p) && act_delta != 0) {
ef743ce6SJohn Dyson				vm_page_activate(p);
bb7858eaSJeff Roberson				p->act_count += act_delta;
ebcddc72SAlan Cox			} else if (vm_page_active(p)) {
bb7858eaSJeff Roberson				if (act_delta == 0) {
ce186587SAlan Cox					p->act_count -= min(p->act_count,
ce186587SAlan Cox					    ACT_DECLINE);
90776bd7SJeff Roberson					if (!remove_mode && p->act_count == 0) {
4fec79beSAlan Cox						pmap_remove_all(p);
26f9a767SRodney W. Grimes						vm_page_deactivate(p);
8d220203SAlan Cox					} else
8d220203SAlan Cox						vm_page_requeue(p);
c8c4b40cSJohn Dyson				} else {
eaf13dd7SJohn Dyson					vm_page_activate(p);
ce186587SAlan Cox					if (p->act_count < ACT_MAX -
ce186587SAlan Cox					    ACT_ADVANCE)
38efa82bSJohn Dyson						p->act_count += ACT_ADVANCE;
8d220203SAlan Cox					vm_page_requeue(p);
ce186587SAlan Cox				}
ebcddc72SAlan Cox			} else if (vm_page_inactive(p))
ce186587SAlan Cox				pmap_remove_all(p);
2965a453SKip Macy			vm_page_unlock(p);
26f9a767SRodney W. Grimes		}
ecf6279fSAlan Cox		if ((backing_object = object->backing_object) == NULL)
ecf6279fSAlan Cox			goto unlock_return;
e23b0a19SAlan Cox		VM_OBJECT_RLOCK(backing_object);
ecf6279fSAlan Cox		if (object != first_object)
e23b0a19SAlan Cox			VM_OBJECT_RUNLOCK(object);
38efa82bSJohn Dyson	}
ecf6279fSAlan Coxunlock_return:
ecf6279fSAlan Cox	if (object != first_object)
e23b0a19SAlan Cox		VM_OBJECT_RUNLOCK(object);
26f9a767SRodney W. Grimes}
26f9a767SRodney W. Grimes
26f9a767SRodney W. Grimes/*
26f9a767SRodney W. Grimes * deactivate some number of pages in a map, try to do it fairly, but
26f9a767SRodney W. Grimes * that is really hard to do.
26f9a767SRodney W. Grimes */
cd41fc12SDavid Greenmanstatic void
38efa82bSJohn Dysonvm_pageout_map_deactivate_pages(map, desired)
26f9a767SRodney W. Grimes	vm_map_t map;
ecf6279fSAlan Cox	long desired;
26f9a767SRodney W. Grimes{
26f9a767SRodney W. Grimes	vm_map_entry_t tmpe;
38efa82bSJohn Dyson	vm_object_t obj, bigobj;
30105b9eSTor Egge	int nothingwired;
0d94caffSDavid Greenman
d974f03cSAlan Cox	if (!vm_map_trylock(map))
26f9a767SRodney W. Grimes		return;
38efa82bSJohn Dyson
38efa82bSJohn Dyson	bigobj = NULL;
30105b9eSTor Egge	nothingwired = TRUE;
38efa82bSJohn Dyson
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * first, search out the biggest object, and try to free pages from
38efa82bSJohn Dyson	 * that.
38efa82bSJohn Dyson	 */
26f9a767SRodney W. Grimes	tmpe = map->header.next;
38efa82bSJohn Dyson	while (tmpe != &map->header) {
9fdfe602SMatthew Dillon		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
38efa82bSJohn Dyson			obj = tmpe->object.vm_object;
e23b0a19SAlan Cox			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
0774dfb3SAlan Cox				if (obj->shadow_count <= 1 &&
0774dfb3SAlan Cox				    (bigobj == NULL ||
0774dfb3SAlan Cox				     bigobj->resident_page_count < obj->resident_page_count)) {
0774dfb3SAlan Cox					if (bigobj != NULL)
e23b0a19SAlan Cox						VM_OBJECT_RUNLOCK(bigobj);
38efa82bSJohn Dyson					bigobj = obj;
0774dfb3SAlan Cox				} else
e23b0a19SAlan Cox					VM_OBJECT_RUNLOCK(obj);
38efa82bSJohn Dyson			}
38efa82bSJohn Dyson		}
30105b9eSTor Egge		if (tmpe->wired_count > 0)
30105b9eSTor Egge			nothingwired = FALSE;
38efa82bSJohn Dyson		tmpe = tmpe->next;
38efa82bSJohn Dyson	}
38efa82bSJohn Dyson
0774dfb3SAlan Cox	if (bigobj != NULL) {
ecf6279fSAlan Cox		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
e23b0a19SAlan Cox		VM_OBJECT_RUNLOCK(bigobj);
0774dfb3SAlan Cox	}
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * Next, hunt around for other pages to deactivate.  We actually
38efa82bSJohn Dyson	 * do this search sort of wrong -- .text first is not the best idea.
38efa82bSJohn Dyson	 */
38efa82bSJohn Dyson	tmpe = map->header.next;
38efa82bSJohn Dyson	while (tmpe != &map->header) {
b1028ad1SLuoqi Chen		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
38efa82bSJohn Dyson			break;
9fdfe602SMatthew Dillon		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
38efa82bSJohn Dyson			obj = tmpe->object.vm_object;
0774dfb3SAlan Cox			if (obj != NULL) {
e23b0a19SAlan Cox				VM_OBJECT_RLOCK(obj);
ecf6279fSAlan Cox				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
e23b0a19SAlan Cox				VM_OBJECT_RUNLOCK(obj);
0774dfb3SAlan Cox			}
38efa82bSJohn Dyson		}
26f9a767SRodney W. Grimes		tmpe = tmpe->next;
38857e7fSAlan Cox	}
38efa82bSJohn Dyson
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * Remove all mappings if a process is swapped out, this will free page
38efa82bSJohn Dyson	 * table pages.
38efa82bSJohn Dyson	 */
38857e7fSAlan Cox	if (desired == 0 && nothingwired) {
8d01a3b2SNathan Whitehorn		pmap_remove(vm_map_pmap(map), vm_map_min(map),
8d01a3b2SNathan Whitehorn		    vm_map_max(map));
38857e7fSAlan Cox	}
938b0f5bSMarcel Moolenaar
38efa82bSJohn Dyson	vm_map_unlock(map);
26f9a767SRodney W. Grimes}
a1287949SEivind Eklund#endif		/* !defined(NO_SWAPPING) */
df8bae1dSRodney W. Grimes
1c7c3c6aSMatthew Dillon/*
34d8b7eaSJeff Roberson * Attempt to acquire all of the necessary locks to launder a page and
34d8b7eaSJeff Roberson * then call through the clustering layer to PUTPAGES.  Wait a short
34d8b7eaSJeff Roberson * time for a vnode lock.
34d8b7eaSJeff Roberson *
34d8b7eaSJeff Roberson * Requires the page and object lock on entry, releases both before return.
34d8b7eaSJeff Roberson * Returns 0 on success and an errno otherwise.
34d8b7eaSJeff Roberson */
34d8b7eaSJeff Robersonstatic int
ebcddc72SAlan Coxvm_pageout_clean(vm_page_t m, int *numpagedout)
34d8b7eaSJeff Roberson{
34d8b7eaSJeff Roberson	struct vnode *vp;
34d8b7eaSJeff Roberson	struct mount *mp;
34d8b7eaSJeff Roberson	vm_object_t object;
34d8b7eaSJeff Roberson	vm_pindex_t pindex;
34d8b7eaSJeff Roberson	int error, lockmode;
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson	vm_page_assert_locked(m);
34d8b7eaSJeff Roberson	object = m->object;
34d8b7eaSJeff Roberson	VM_OBJECT_ASSERT_WLOCKED(object);
34d8b7eaSJeff Roberson	error = 0;
34d8b7eaSJeff Roberson	vp = NULL;
34d8b7eaSJeff Roberson	mp = NULL;
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson	/*
34d8b7eaSJeff Roberson	 * The object is already known NOT to be dead.   It
34d8b7eaSJeff Roberson	 * is possible for the vget() to block the whole
34d8b7eaSJeff Roberson	 * pageout daemon, but the new low-memory handling
34d8b7eaSJeff Roberson	 * code should prevent it.
34d8b7eaSJeff Roberson	 *
34d8b7eaSJeff Roberson	 * We can't wait forever for the vnode lock, we might
34d8b7eaSJeff Roberson	 * deadlock due to a vn_read() getting stuck in
34d8b7eaSJeff Roberson	 * vm_wait while holding this vnode.  We skip the
34d8b7eaSJeff Roberson	 * vnode if we can't get it in a reasonable amount
34d8b7eaSJeff Roberson	 * of time.
34d8b7eaSJeff Roberson	 */
34d8b7eaSJeff Roberson	if (object->type == OBJT_VNODE) {
34d8b7eaSJeff Roberson		vm_page_unlock(m);
34d8b7eaSJeff Roberson		vp = object->handle;
34d8b7eaSJeff Roberson		if (vp->v_type == VREG &&
34d8b7eaSJeff Roberson		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
34d8b7eaSJeff Roberson			mp = NULL;
34d8b7eaSJeff Roberson			error = EDEADLK;
34d8b7eaSJeff Roberson			goto unlock_all;
34d8b7eaSJeff Roberson		}
34d8b7eaSJeff Roberson		KASSERT(mp != NULL,
34d8b7eaSJeff Roberson		    ("vp %p with NULL v_mount", vp));
34d8b7eaSJeff Roberson		vm_object_reference_locked(object);
34d8b7eaSJeff Roberson		pindex = m->pindex;
34d8b7eaSJeff Roberson		VM_OBJECT_WUNLOCK(object);
34d8b7eaSJeff Roberson		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
34d8b7eaSJeff Roberson		    LK_SHARED : LK_EXCLUSIVE;
34d8b7eaSJeff Roberson		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
34d8b7eaSJeff Roberson			vp = NULL;
34d8b7eaSJeff Roberson			error = EDEADLK;
34d8b7eaSJeff Roberson			goto unlock_mp;
34d8b7eaSJeff Roberson		}
34d8b7eaSJeff Roberson		VM_OBJECT_WLOCK(object);
34d8b7eaSJeff Roberson		vm_page_lock(m);
34d8b7eaSJeff Roberson		/*
34d8b7eaSJeff Roberson		 * While the object and page were unlocked, the page
34d8b7eaSJeff Roberson		 * may have been:
34d8b7eaSJeff Roberson		 * (1) moved to a different queue,
34d8b7eaSJeff Roberson		 * (2) reallocated to a different object,
34d8b7eaSJeff Roberson		 * (3) reallocated to a different offset, or
34d8b7eaSJeff Roberson		 * (4) cleaned.
34d8b7eaSJeff Roberson		 */
ebcddc72SAlan Cox		if (!vm_page_in_laundry(m) || m->object != object ||
34d8b7eaSJeff Roberson		    m->pindex != pindex || m->dirty == 0) {
34d8b7eaSJeff Roberson			vm_page_unlock(m);
34d8b7eaSJeff Roberson			error = ENXIO;
34d8b7eaSJeff Roberson			goto unlock_all;
34d8b7eaSJeff Roberson		}
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson		/*
34d8b7eaSJeff Roberson		 * The page may have been busied or held while the object
34d8b7eaSJeff Roberson		 * and page locks were released.
34d8b7eaSJeff Roberson		 */
34d8b7eaSJeff Roberson		if (vm_page_busied(m) || m->hold_count != 0) {
34d8b7eaSJeff Roberson			vm_page_unlock(m);
34d8b7eaSJeff Roberson			error = EBUSY;
34d8b7eaSJeff Roberson			goto unlock_all;
34d8b7eaSJeff Roberson		}
34d8b7eaSJeff Roberson	}
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson	/*
34d8b7eaSJeff Roberson	 * If a page is dirty, then it is either being washed
34d8b7eaSJeff Roberson	 * (but not yet cleaned) or it is still in the
34d8b7eaSJeff Roberson	 * laundry.  If it is still in the laundry, then we
34d8b7eaSJeff Roberson	 * start the cleaning operation.
34d8b7eaSJeff Roberson	 */
ebcddc72SAlan Cox	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
34d8b7eaSJeff Roberson		error = EIO;
34d8b7eaSJeff Roberson
34d8b7eaSJeff Robersonunlock_all:
34d8b7eaSJeff Roberson	VM_OBJECT_WUNLOCK(object);
34d8b7eaSJeff Roberson
34d8b7eaSJeff Robersonunlock_mp:
34d8b7eaSJeff Roberson	vm_page_lock_assert(m, MA_NOTOWNED);
34d8b7eaSJeff Roberson	if (mp != NULL) {
34d8b7eaSJeff Roberson		if (vp != NULL)
34d8b7eaSJeff Roberson			vput(vp);
34d8b7eaSJeff Roberson		vm_object_deallocate(object);
34d8b7eaSJeff Roberson		vn_finished_write(mp);
34d8b7eaSJeff Roberson	}
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson	return (error);
34d8b7eaSJeff Roberson}
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson/*
ebcddc72SAlan Cox * Attempt to launder the specified number of pages.
ebcddc72SAlan Cox *
ebcddc72SAlan Cox * Returns the number of pages successfully laundered.
ebcddc72SAlan Cox */
ebcddc72SAlan Coxstatic int
ebcddc72SAlan Coxvm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
ebcddc72SAlan Cox{
ebcddc72SAlan Cox	struct vm_pagequeue *pq;
ebcddc72SAlan Cox	vm_object_t object;
ebcddc72SAlan Cox	vm_page_t m, next;
ebcddc72SAlan Cox	int act_delta, error, maxscan, numpagedout, starting_target;
ebcddc72SAlan Cox	int vnodes_skipped;
ebcddc72SAlan Cox	bool pageout_ok, queue_locked;
ebcddc72SAlan Cox
ebcddc72SAlan Cox	starting_target = launder;
ebcddc72SAlan Cox	vnodes_skipped = 0;
ebcddc72SAlan Cox
ebcddc72SAlan Cox	/*
b1fd102eSMark Johnston	 * Scan the laundry queues for pages eligible to be laundered.  We stop
ebcddc72SAlan Cox	 * once the target number of dirty pages have been laundered, or once
ebcddc72SAlan Cox	 * we've reached the end of the queue.  A single iteration of this loop
ebcddc72SAlan Cox	 * may cause more than one page to be laundered because of clustering.
ebcddc72SAlan Cox	 *
ebcddc72SAlan Cox	 * maxscan ensures that we don't re-examine requeued pages.  Any
ebcddc72SAlan Cox	 * additional pages written as part of a cluster are subtracted from
ebcddc72SAlan Cox	 * maxscan since they must be taken from the laundry queue.
b1fd102eSMark Johnston	 *
b1fd102eSMark Johnston	 * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
b1fd102eSMark Johnston	 * swap devices are configured.
ebcddc72SAlan Cox	 */
b1fd102eSMark Johnston	if (atomic_load_acq_int(&swapdev_enabled))
b1fd102eSMark Johnston		pq = &vmd->vmd_pagequeues[PQ_UNSWAPPABLE];
b1fd102eSMark Johnston	else
ebcddc72SAlan Cox		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
ebcddc72SAlan Cox
b1fd102eSMark Johnstonscan:
ebcddc72SAlan Cox	vm_pagequeue_lock(pq);
b1fd102eSMark Johnston	maxscan = pq->pq_cnt;
ebcddc72SAlan Cox	queue_locked = true;
ebcddc72SAlan Cox	for (m = TAILQ_FIRST(&pq->pq_pl);
ebcddc72SAlan Cox	    m != NULL && maxscan-- > 0 && launder > 0;
ebcddc72SAlan Cox	    m = next) {
ebcddc72SAlan Cox		vm_pagequeue_assert_locked(pq);
ebcddc72SAlan Cox		KASSERT(queue_locked, ("unlocked laundry queue"));
ebcddc72SAlan Cox		KASSERT(vm_page_in_laundry(m),
ebcddc72SAlan Cox		    ("page %p has an inconsistent queue", m));
ebcddc72SAlan Cox		next = TAILQ_NEXT(m, plinks.q);
ebcddc72SAlan Cox		if ((m->flags & PG_MARKER) != 0)
ebcddc72SAlan Cox			continue;
ebcddc72SAlan Cox		KASSERT((m->flags & PG_FICTITIOUS) == 0,
ebcddc72SAlan Cox		    ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
ebcddc72SAlan Cox		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
ebcddc72SAlan Cox		    ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
ebcddc72SAlan Cox		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
ebcddc72SAlan Cox			vm_page_unlock(m);
ebcddc72SAlan Cox			continue;
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox		object = m->object;
ebcddc72SAlan Cox		if ((!VM_OBJECT_TRYWLOCK(object) &&
ebcddc72SAlan Cox		    (!vm_pageout_fallback_object_lock(m, &next) ||
ebcddc72SAlan Cox		    m->hold_count != 0)) || vm_page_busied(m)) {
ebcddc72SAlan Cox			VM_OBJECT_WUNLOCK(object);
ebcddc72SAlan Cox			vm_page_unlock(m);
ebcddc72SAlan Cox			continue;
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * Unlock the laundry queue, invalidating the 'next' pointer.
ebcddc72SAlan Cox		 * Use a marker to remember our place in the laundry queue.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
ebcddc72SAlan Cox		    plinks.q);
ebcddc72SAlan Cox		vm_pagequeue_unlock(pq);
ebcddc72SAlan Cox		queue_locked = false;
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * Invalid pages can be easily freed.  They cannot be
ebcddc72SAlan Cox		 * mapped; vm_page_free() asserts this.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		if (m->valid == 0)
ebcddc72SAlan Cox			goto free_page;
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * If the page has been referenced and the object is not dead,
ebcddc72SAlan Cox		 * reactivate or requeue the page depending on whether the
ebcddc72SAlan Cox		 * object is mapped.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		if ((m->aflags & PGA_REFERENCED) != 0) {
ebcddc72SAlan Cox			vm_page_aflag_clear(m, PGA_REFERENCED);
ebcddc72SAlan Cox			act_delta = 1;
ebcddc72SAlan Cox		} else
ebcddc72SAlan Cox			act_delta = 0;
ebcddc72SAlan Cox		if (object->ref_count != 0)
ebcddc72SAlan Cox			act_delta += pmap_ts_referenced(m);
ebcddc72SAlan Cox		else {
ebcddc72SAlan Cox			KASSERT(!pmap_page_is_mapped(m),
ebcddc72SAlan Cox			    ("page %p is mapped", m));
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox		if (act_delta != 0) {
ebcddc72SAlan Cox			if (object->ref_count != 0) {
*83c9dea1SGleb Smirnoff				VM_CNT_INC(v_reactivated);
ebcddc72SAlan Cox				vm_page_activate(m);
ebcddc72SAlan Cox
ebcddc72SAlan Cox				/*
ebcddc72SAlan Cox				 * Increase the activation count if the page
ebcddc72SAlan Cox				 * was referenced while in the laundry queue.
ebcddc72SAlan Cox				 * This makes it less likely that the page will
ebcddc72SAlan Cox				 * be returned prematurely to the inactive
ebcddc72SAlan Cox				 * queue.
ebcddc72SAlan Cox 				 */
ebcddc72SAlan Cox				m->act_count += act_delta + ACT_ADVANCE;
ebcddc72SAlan Cox
ebcddc72SAlan Cox				/*
ebcddc72SAlan Cox				 * If this was a background laundering, count
ebcddc72SAlan Cox				 * activated pages towards our target.  The
ebcddc72SAlan Cox				 * purpose of background laundering is to ensure
ebcddc72SAlan Cox				 * that pages are eventually cycled through the
ebcddc72SAlan Cox				 * laundry queue, and an activation is a valid
ebcddc72SAlan Cox				 * way out.
ebcddc72SAlan Cox				 */
ebcddc72SAlan Cox				if (!in_shortfall)
ebcddc72SAlan Cox					launder--;
ebcddc72SAlan Cox				goto drop_page;
ebcddc72SAlan Cox			} else if ((object->flags & OBJ_DEAD) == 0)
ebcddc72SAlan Cox				goto requeue_page;
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * If the page appears to be clean at the machine-independent
ebcddc72SAlan Cox		 * layer, then remove all of its mappings from the pmap in
ebcddc72SAlan Cox		 * anticipation of freeing it.  If, however, any of the page's
ebcddc72SAlan Cox		 * mappings allow write access, then the page may still be
ebcddc72SAlan Cox		 * modified until the last of those mappings are removed.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		if (object->ref_count != 0) {
ebcddc72SAlan Cox			vm_page_test_dirty(m);
ebcddc72SAlan Cox			if (m->dirty == 0)
ebcddc72SAlan Cox				pmap_remove_all(m);
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * Clean pages are freed, and dirty pages are paged out unless
ebcddc72SAlan Cox		 * they belong to a dead object.  Requeueing dirty pages from
ebcddc72SAlan Cox		 * dead objects is pointless, as they are being paged out and
ebcddc72SAlan Cox		 * freed by the thread that destroyed the object.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		if (m->dirty == 0) {
ebcddc72SAlan Coxfree_page:
ebcddc72SAlan Cox			vm_page_free(m);
*83c9dea1SGleb Smirnoff			VM_CNT_INC(v_dfree);
ebcddc72SAlan Cox		} else if ((object->flags & OBJ_DEAD) == 0) {
ebcddc72SAlan Cox			if (object->type != OBJT_SWAP &&
ebcddc72SAlan Cox			    object->type != OBJT_DEFAULT)
ebcddc72SAlan Cox				pageout_ok = true;
ebcddc72SAlan Cox			else if (disable_swap_pageouts)
ebcddc72SAlan Cox				pageout_ok = false;
ebcddc72SAlan Cox			else
ebcddc72SAlan Cox				pageout_ok = true;
ebcddc72SAlan Cox			if (!pageout_ok) {
ebcddc72SAlan Coxrequeue_page:
ebcddc72SAlan Cox				vm_pagequeue_lock(pq);
ebcddc72SAlan Cox				queue_locked = true;
ebcddc72SAlan Cox				vm_page_requeue_locked(m);
ebcddc72SAlan Cox				goto drop_page;
ebcddc72SAlan Cox			}
ebcddc72SAlan Cox
ebcddc72SAlan Cox			/*
ebcddc72SAlan Cox			 * Form a cluster with adjacent, dirty pages from the
ebcddc72SAlan Cox			 * same object, and page out that entire cluster.
ebcddc72SAlan Cox			 *
ebcddc72SAlan Cox			 * The adjacent, dirty pages must also be in the
ebcddc72SAlan Cox			 * laundry.  However, their mappings are not checked
ebcddc72SAlan Cox			 * for new references.  Consequently, a recently
ebcddc72SAlan Cox			 * referenced page may be paged out.  However, that
ebcddc72SAlan Cox			 * page will not be prematurely reclaimed.  After page
ebcddc72SAlan Cox			 * out, the page will be placed in the inactive queue,
ebcddc72SAlan Cox			 * where any new references will be detected and the
ebcddc72SAlan Cox			 * page reactivated.
ebcddc72SAlan Cox			 */
ebcddc72SAlan Cox			error = vm_pageout_clean(m, &numpagedout);
ebcddc72SAlan Cox			if (error == 0) {
ebcddc72SAlan Cox				launder -= numpagedout;
ebcddc72SAlan Cox				maxscan -= numpagedout - 1;
ebcddc72SAlan Cox			} else if (error == EDEADLK) {
ebcddc72SAlan Cox				pageout_lock_miss++;
ebcddc72SAlan Cox				vnodes_skipped++;
ebcddc72SAlan Cox			}
ebcddc72SAlan Cox			goto relock_queue;
ebcddc72SAlan Cox		}
ebcddc72SAlan Coxdrop_page:
ebcddc72SAlan Cox		vm_page_unlock(m);
ebcddc72SAlan Cox		VM_OBJECT_WUNLOCK(object);
ebcddc72SAlan Coxrelock_queue:
ebcddc72SAlan Cox		if (!queue_locked) {
ebcddc72SAlan Cox			vm_pagequeue_lock(pq);
ebcddc72SAlan Cox			queue_locked = true;
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox		next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
ebcddc72SAlan Cox		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
ebcddc72SAlan Cox	}
ebcddc72SAlan Cox	vm_pagequeue_unlock(pq);
ebcddc72SAlan Cox
b1fd102eSMark Johnston	if (launder > 0 && pq == &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]) {
b1fd102eSMark Johnston		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
b1fd102eSMark Johnston		goto scan;
b1fd102eSMark Johnston	}
b1fd102eSMark Johnston
ebcddc72SAlan Cox	/*
ebcddc72SAlan Cox	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
ebcddc72SAlan Cox	 * and we didn't launder enough pages.
ebcddc72SAlan Cox	 */
ebcddc72SAlan Cox	if (vnodes_skipped > 0 && launder > 0)
ebcddc72SAlan Cox		(void)speedup_syncer();
ebcddc72SAlan Cox
ebcddc72SAlan Cox	return (starting_target - launder);
ebcddc72SAlan Cox}
ebcddc72SAlan Cox
ebcddc72SAlan Cox/*
ebcddc72SAlan Cox * Compute the integer square root.
ebcddc72SAlan Cox */
ebcddc72SAlan Coxstatic u_int
ebcddc72SAlan Coxisqrt(u_int num)
ebcddc72SAlan Cox{
ebcddc72SAlan Cox	u_int bit, root, tmp;
ebcddc72SAlan Cox
ebcddc72SAlan Cox	bit = 1u << ((NBBY * sizeof(u_int)) - 2);
ebcddc72SAlan Cox	while (bit > num)
ebcddc72SAlan Cox		bit >>= 2;
ebcddc72SAlan Cox	root = 0;
ebcddc72SAlan Cox	while (bit != 0) {
ebcddc72SAlan Cox		tmp = root + bit;
ebcddc72SAlan Cox		root >>= 1;
ebcddc72SAlan Cox		if (num >= tmp) {
ebcddc72SAlan Cox			num -= tmp;
ebcddc72SAlan Cox			root += bit;
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox		bit >>= 2;
ebcddc72SAlan Cox	}
ebcddc72SAlan Cox	return (root);
ebcddc72SAlan Cox}
ebcddc72SAlan Cox
ebcddc72SAlan Cox/*
ebcddc72SAlan Cox * Perform the work of the laundry thread: periodically wake up and determine
ebcddc72SAlan Cox * whether any pages need to be laundered.  If so, determine the number of pages
ebcddc72SAlan Cox * that need to be laundered, and launder them.
ebcddc72SAlan Cox */
ebcddc72SAlan Coxstatic void
ebcddc72SAlan Coxvm_pageout_laundry_worker(void *arg)
ebcddc72SAlan Cox{
ebcddc72SAlan Cox	struct vm_domain *domain;
ebcddc72SAlan Cox	struct vm_pagequeue *pq;
ebcddc72SAlan Cox	uint64_t nclean, ndirty;
ebcddc72SAlan Cox	u_int last_launder, wakeups;
ebcddc72SAlan Cox	int domidx, last_target, launder, shortfall, shortfall_cycle, target;
ebcddc72SAlan Cox	bool in_shortfall;
ebcddc72SAlan Cox
ebcddc72SAlan Cox	domidx = (uintptr_t)arg;
ebcddc72SAlan Cox	domain = &vm_dom[domidx];
ebcddc72SAlan Cox	pq = &domain->vmd_pagequeues[PQ_LAUNDRY];
ebcddc72SAlan Cox	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
ebcddc72SAlan Cox	vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
ebcddc72SAlan Cox
ebcddc72SAlan Cox	shortfall = 0;
ebcddc72SAlan Cox	in_shortfall = false;
ebcddc72SAlan Cox	shortfall_cycle = 0;
ebcddc72SAlan Cox	target = 0;
ebcddc72SAlan Cox	last_launder = 0;
ebcddc72SAlan Cox
ebcddc72SAlan Cox	/*
b1fd102eSMark Johnston	 * Calls to these handlers are serialized by the swap syscall lock.
b1fd102eSMark Johnston	 */
b1fd102eSMark Johnston	(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, domain,
b1fd102eSMark Johnston	    EVENTHANDLER_PRI_ANY);
b1fd102eSMark Johnston	(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, domain,
b1fd102eSMark Johnston	    EVENTHANDLER_PRI_ANY);
b1fd102eSMark Johnston
b1fd102eSMark Johnston	/*
ebcddc72SAlan Cox	 * The pageout laundry worker is never done, so loop forever.
ebcddc72SAlan Cox	 */
ebcddc72SAlan Cox	for (;;) {
ebcddc72SAlan Cox		KASSERT(target >= 0, ("negative target %d", target));
ebcddc72SAlan Cox		KASSERT(shortfall_cycle >= 0,
ebcddc72SAlan Cox		    ("negative cycle %d", shortfall_cycle));
ebcddc72SAlan Cox		launder = 0;
*83c9dea1SGleb Smirnoff		wakeups = VM_CNT_FETCH(v_pdwakeups);
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * First determine whether we need to launder pages to meet a
ebcddc72SAlan Cox		 * shortage of free pages.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		if (shortfall > 0) {
ebcddc72SAlan Cox			in_shortfall = true;
ebcddc72SAlan Cox			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
ebcddc72SAlan Cox			target = shortfall;
ebcddc72SAlan Cox		} else if (!in_shortfall)
ebcddc72SAlan Cox			goto trybackground;
ebcddc72SAlan Cox		else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) {
ebcddc72SAlan Cox			/*
ebcddc72SAlan Cox			 * We recently entered shortfall and began laundering
ebcddc72SAlan Cox			 * pages.  If we have completed that laundering run
ebcddc72SAlan Cox			 * (and we are no longer in shortfall) or we have met
ebcddc72SAlan Cox			 * our laundry target through other activity, then we
ebcddc72SAlan Cox			 * can stop laundering pages.
ebcddc72SAlan Cox			 */
ebcddc72SAlan Cox			in_shortfall = false;
ebcddc72SAlan Cox			target = 0;
ebcddc72SAlan Cox			goto trybackground;
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox		last_launder = wakeups;
ebcddc72SAlan Cox		launder = target / shortfall_cycle--;
ebcddc72SAlan Cox		goto dolaundry;
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * There's no immediate need to launder any pages; see if we
ebcddc72SAlan Cox		 * meet the conditions to perform background laundering:
ebcddc72SAlan Cox		 *
ebcddc72SAlan Cox		 * 1. The ratio of dirty to clean inactive pages exceeds the
ebcddc72SAlan Cox		 *    background laundering threshold and the pagedaemon has
ebcddc72SAlan Cox		 *    been woken up to reclaim pages since our last
ebcddc72SAlan Cox		 *    laundering, or
ebcddc72SAlan Cox		 * 2. we haven't yet reached the target of the current
ebcddc72SAlan Cox		 *    background laundering run.
ebcddc72SAlan Cox		 *
ebcddc72SAlan Cox		 * The background laundering threshold is not a constant.
ebcddc72SAlan Cox		 * Instead, it is a slowly growing function of the number of
ebcddc72SAlan Cox		 * page daemon wakeups since the last laundering.  Thus, as the
ebcddc72SAlan Cox		 * ratio of dirty to clean inactive pages grows, the amount of
ebcddc72SAlan Cox		 * memory pressure required to trigger laundering decreases.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Coxtrybackground:
ebcddc72SAlan Cox		nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count;
ebcddc72SAlan Cox		ndirty = vm_cnt.v_laundry_count;
ebcddc72SAlan Cox		if (target == 0 && wakeups != last_launder &&
ebcddc72SAlan Cox		    ndirty * isqrt(wakeups - last_launder) >= nclean) {
ebcddc72SAlan Cox			target = vm_background_launder_target;
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * We have a non-zero background laundering target.  If we've
ebcddc72SAlan Cox		 * laundered up to our maximum without observing a page daemon
ebcddc72SAlan Cox		 * wakeup, just stop.  This is a safety belt that ensures we
ebcddc72SAlan Cox		 * don't launder an excessive amount if memory pressure is low
ebcddc72SAlan Cox		 * and the ratio of dirty to clean pages is large.  Otherwise,
ebcddc72SAlan Cox		 * proceed at the background laundering rate.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		if (target > 0) {
ebcddc72SAlan Cox			if (wakeups != last_launder) {
ebcddc72SAlan Cox				last_launder = wakeups;
ebcddc72SAlan Cox				last_target = target;
ebcddc72SAlan Cox			} else if (last_target - target >=
ebcddc72SAlan Cox			    vm_background_launder_max * PAGE_SIZE / 1024) {
ebcddc72SAlan Cox				target = 0;
ebcddc72SAlan Cox			}
ebcddc72SAlan Cox			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
ebcddc72SAlan Cox			launder /= VM_LAUNDER_RATE;
ebcddc72SAlan Cox			if (launder > target)
ebcddc72SAlan Cox				launder = target;
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox
ebcddc72SAlan Coxdolaundry:
ebcddc72SAlan Cox		if (launder > 0) {
ebcddc72SAlan Cox			/*
ebcddc72SAlan Cox			 * Because of I/O clustering, the number of laundered
ebcddc72SAlan Cox			 * pages could exceed "target" by the maximum size of
ebcddc72SAlan Cox			 * a cluster minus one.
ebcddc72SAlan Cox			 */
ebcddc72SAlan Cox			target -= min(vm_pageout_launder(domain, launder,
ebcddc72SAlan Cox			    in_shortfall), target);
ebcddc72SAlan Cox			pause("laundp", hz / VM_LAUNDER_RATE);
ebcddc72SAlan Cox		}
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * If we're not currently laundering pages and the page daemon
ebcddc72SAlan Cox		 * hasn't posted a new request, sleep until the page daemon
ebcddc72SAlan Cox		 * kicks us.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		vm_pagequeue_lock(pq);
ebcddc72SAlan Cox		if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE)
ebcddc72SAlan Cox			(void)mtx_sleep(&vm_laundry_request,
ebcddc72SAlan Cox			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);
ebcddc72SAlan Cox
ebcddc72SAlan Cox		/*
ebcddc72SAlan Cox		 * If the pagedaemon has indicated that it's in shortfall, start
ebcddc72SAlan Cox		 * a shortfall laundering unless we're already in the middle of
ebcddc72SAlan Cox		 * one.  This may preempt a background laundering.
ebcddc72SAlan Cox		 */
ebcddc72SAlan Cox		if (vm_laundry_request == VM_LAUNDRY_SHORTFALL &&
ebcddc72SAlan Cox		    (!in_shortfall || shortfall_cycle == 0)) {
ebcddc72SAlan Cox			shortfall = vm_laundry_target() + vm_pageout_deficit;
ebcddc72SAlan Cox			target = 0;
ebcddc72SAlan Cox		} else
ebcddc72SAlan Cox			shortfall = 0;
ebcddc72SAlan Cox
ebcddc72SAlan Cox		if (target == 0)
ebcddc72SAlan Cox			vm_laundry_request = VM_LAUNDRY_IDLE;
ebcddc72SAlan Cox		vm_pagequeue_unlock(pq);
ebcddc72SAlan Cox	}
ebcddc72SAlan Cox}
ebcddc72SAlan Cox
ebcddc72SAlan Cox/*
df8bae1dSRodney W. Grimes *	vm_pageout_scan does the dirty work for the pageout daemon.
d9e23210SJeff Roberson *
ebcddc72SAlan Cox *	pass == 0: Update active LRU/deactivate pages
ebcddc72SAlan Cox *	pass >= 1: Free inactive pages
e57dd910SAlan Cox *
e57dd910SAlan Cox * Returns true if pass was zero or enough pages were freed by the inactive
e57dd910SAlan Cox * queue scan to meet the target.
df8bae1dSRodney W. Grimes */
e57dd910SAlan Coxstatic bool
449c2e92SKonstantin Belousovvm_pageout_scan(struct vm_domain *vmd, int pass)
df8bae1dSRodney W. Grimes{
502ba6e4SJohn Dyson	vm_page_t m, next;
8d220203SAlan Cox	struct vm_pagequeue *pq;
df8bae1dSRodney W. Grimes	vm_object_t object;
22cf98d1SAlan Cox	long min_scan;
ebcddc72SAlan Cox	int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
ebcddc72SAlan Cox	int page_shortage, scan_tick, scanned, starting_page_shortage;
ebcddc72SAlan Cox	boolean_t queue_locked;
0d94caffSDavid Greenman
df8bae1dSRodney W. Grimes	/*
d9e23210SJeff Roberson	 * If we need to reclaim memory ask kernel caches to return
c9612b2dSJeff Roberson	 * some.  We rate limit to avoid thrashing.
d9e23210SJeff Roberson	 */
c9612b2dSJeff Roberson	if (vmd == &vm_dom[0] && pass > 0 &&
a6bf3a9eSRyan Stone	    (time_uptime - lowmem_uptime) >= lowmem_period) {
d9e23210SJeff Roberson		/*
855a310fSJeff Roberson		 * Decrease registered cache sizes.
855a310fSJeff Roberson		 */
14a0d74eSSteven Hartland		SDT_PROBE0(vm, , , vm__lowmem_scan);
9b43bc27SAndriy Gapon		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
855a310fSJeff Roberson		/*
d9e23210SJeff Roberson		 * We do this explicitly after the caches have been
d9e23210SJeff Roberson		 * drained above.
855a310fSJeff Roberson		 */
855a310fSJeff Roberson		uma_reclaim();
a6bf3a9eSRyan Stone		lowmem_uptime = time_uptime;
d9e23210SJeff Roberson	}
5985940eSJohn Dyson
311e34e2SKonstantin Belousov	/*
96240c89SEitan Adler	 * The addl_page_shortage is the number of temporarily
311e34e2SKonstantin Belousov	 * stuck pages in the inactive queue.  In other words, the
449c2e92SKonstantin Belousov	 * number of pages from the inactive count that should be
311e34e2SKonstantin Belousov	 * discounted in setting the target for the active queue scan.
311e34e2SKonstantin Belousov	 */
9099545aSAlan Cox	addl_page_shortage = 0;
9099545aSAlan Cox
1c7c3c6aSMatthew Dillon	/*
e57dd910SAlan Cox	 * Calculate the number of pages that we want to free.  This number
e57dd910SAlan Cox	 * can be negative if many pages are freed between the wakeup call to
e57dd910SAlan Cox	 * the page daemon and this calculation.
1c7c3c6aSMatthew Dillon	 */
60196cdaSAlan Cox	if (pass > 0) {
60196cdaSAlan Cox		deficit = atomic_readandclear_int(&vm_pageout_deficit);
9099545aSAlan Cox		page_shortage = vm_paging_target() + deficit;
60196cdaSAlan Cox	} else
60196cdaSAlan Cox		page_shortage = deficit = 0;
76386c7eSKonstantin Belousov	starting_page_shortage = page_shortage;
1c7c3c6aSMatthew Dillon
936524aaSMatthew Dillon	/*
f095d1bbSAlan Cox	 * Start scanning the inactive queue for pages that we can free.  The
f095d1bbSAlan Cox	 * scan will stop when we reach the target or we have scanned the
f095d1bbSAlan Cox	 * entire queue.  (Note that m->act_count is not used to make
f095d1bbSAlan Cox	 * decisions for the inactive queue, only for the active queue.)
8d220203SAlan Cox	 */
449c2e92SKonstantin Belousov	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
449c2e92SKonstantin Belousov	maxscan = pq->pq_cnt;
8d220203SAlan Cox	vm_pagequeue_lock(pq);
3ac8f842SMark Johnston	queue_locked = TRUE;
8d220203SAlan Cox	for (m = TAILQ_FIRST(&pq->pq_pl);
1c7c3c6aSMatthew Dillon	     m != NULL && maxscan-- > 0 && page_shortage > 0;
e929c00dSKirk McKusick	     m = next) {
8d220203SAlan Cox		vm_pagequeue_assert_locked(pq);
3ac8f842SMark Johnston		KASSERT(queue_locked, ("unlocked inactive queue"));
ebcddc72SAlan Cox		KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
df8bae1dSRodney W. Grimes
*83c9dea1SGleb Smirnoff		VM_CNT_INC(v_pdpages);
c325e866SKonstantin Belousov		next = TAILQ_NEXT(m, plinks.q);
df8bae1dSRodney W. Grimes
936524aaSMatthew Dillon		/*
936524aaSMatthew Dillon		 * skip marker pages
936524aaSMatthew Dillon		 */
936524aaSMatthew Dillon		if (m->flags & PG_MARKER)
936524aaSMatthew Dillon			continue;
936524aaSMatthew Dillon
7900f95dSKonstantin Belousov		KASSERT((m->flags & PG_FICTITIOUS) == 0,
7900f95dSKonstantin Belousov		    ("Fictitious page %p cannot be in inactive queue", m));
7900f95dSKonstantin Belousov		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7900f95dSKonstantin Belousov		    ("Unmanaged page %p cannot be in inactive queue", m));
7900f95dSKonstantin Belousov
8c616246SKonstantin Belousov		/*
311e34e2SKonstantin Belousov		 * The page or object lock acquisitions fail if the
311e34e2SKonstantin Belousov		 * page was removed from the queue or moved to a
311e34e2SKonstantin Belousov		 * different position within the queue.  In either
311e34e2SKonstantin Belousov		 * case, addl_page_shortage should not be incremented.
8c616246SKonstantin Belousov		 */
a3aeedabSAlan Cox		if (!vm_pageout_page_lock(m, &next))
a3aeedabSAlan Cox			goto unlock_page;
a3aeedabSAlan Cox		else if (m->hold_count != 0) {
a3aeedabSAlan Cox			/*
a3aeedabSAlan Cox			 * Held pages are essentially stuck in the
a3aeedabSAlan Cox			 * queue.  So, they ought to be discounted
a3aeedabSAlan Cox			 * from the inactive count.  See the
e57dd910SAlan Cox			 * calculation of inactq_shortage before the
a3aeedabSAlan Cox			 * loop over the active queue below.
a3aeedabSAlan Cox			 */
a3aeedabSAlan Cox			addl_page_shortage++;
a3aeedabSAlan Cox			goto unlock_page;
df8bae1dSRodney W. Grimes		}
9ee2165fSAlan Cox		object = m->object;
a3aeedabSAlan Cox		if (!VM_OBJECT_TRYWLOCK(object)) {
a3aeedabSAlan Cox			if (!vm_pageout_fallback_object_lock(m, &next))
a3aeedabSAlan Cox				goto unlock_object;
a3aeedabSAlan Cox			else if (m->hold_count != 0) {
b182ec9eSJohn Dyson				addl_page_shortage++;
a3aeedabSAlan Cox				goto unlock_object;
a3aeedabSAlan Cox			}
a3aeedabSAlan Cox		}
a3aeedabSAlan Cox		if (vm_page_busied(m)) {
a3aeedabSAlan Cox			/*
a3aeedabSAlan Cox			 * Don't mess with busy pages.  Leave them at
a3aeedabSAlan Cox			 * the front of the queue.  Most likely, they
a3aeedabSAlan Cox			 * are being paged out and will leave the
a3aeedabSAlan Cox			 * queue shortly after the scan finishes.  So,
a3aeedabSAlan Cox			 * they ought to be discounted from the
a3aeedabSAlan Cox			 * inactive count.
a3aeedabSAlan Cox			 */
a3aeedabSAlan Cox			addl_page_shortage++;
a3aeedabSAlan Coxunlock_object:
a3aeedabSAlan Cox			VM_OBJECT_WUNLOCK(object);
a3aeedabSAlan Coxunlock_page:
a3aeedabSAlan Cox			vm_page_unlock(m);
26f9a767SRodney W. Grimes			continue;
26f9a767SRodney W. Grimes		}
a3aeedabSAlan Cox		KASSERT(m->hold_count == 0, ("Held page %p", m));
bd7e5f99SJohn Dyson
7e006499SJohn Dyson		/*
ebcddc72SAlan Cox		 * Dequeue the inactive page and unlock the inactive page
ebcddc72SAlan Cox		 * queue, invalidating the 'next' pointer.  Dequeueing the
ebcddc72SAlan Cox		 * page here avoids a later reacquisition (and release) of
ebcddc72SAlan Cox		 * the inactive page queue lock when vm_page_activate(),
ebcddc72SAlan Cox		 * vm_page_free(), or vm_page_launder() is called.  Use a
ebcddc72SAlan Cox		 * marker to remember our place in the inactive queue.
48cc2fc7SKonstantin Belousov		 */
c325e866SKonstantin Belousov		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
ebcddc72SAlan Cox		vm_page_dequeue_locked(m);
8d220203SAlan Cox		vm_pagequeue_unlock(pq);
3ac8f842SMark Johnston		queue_locked = FALSE;
48cc2fc7SKonstantin Belousov
48cc2fc7SKonstantin Belousov		/*
8748f58cSKonstantin Belousov		 * Invalid pages can be easily freed. They cannot be
8748f58cSKonstantin Belousov		 * mapped, vm_page_free() asserts this.
776f729cSKonstantin Belousov		 */
8748f58cSKonstantin Belousov		if (m->valid == 0)
8748f58cSKonstantin Belousov			goto free_page;
776f729cSKonstantin Belousov
776f729cSKonstantin Belousov		/*
960810ccSAlan Cox		 * If the page has been referenced and the object is not dead,
960810ccSAlan Cox		 * reactivate or requeue the page depending on whether the
960810ccSAlan Cox		 * object is mapped.
7e006499SJohn Dyson		 */
bb7858eaSJeff Roberson		if ((m->aflags & PGA_REFERENCED) != 0) {
bb7858eaSJeff Roberson			vm_page_aflag_clear(m, PGA_REFERENCED);
bb7858eaSJeff Roberson			act_delta = 1;
86fa2471SAlan Cox		} else
86fa2471SAlan Cox			act_delta = 0;
bb7858eaSJeff Roberson		if (object->ref_count != 0) {
bb7858eaSJeff Roberson			act_delta += pmap_ts_referenced(m);
bb7858eaSJeff Roberson		} else {
bb7858eaSJeff Roberson			KASSERT(!pmap_page_is_mapped(m),
bb7858eaSJeff Roberson			    ("vm_pageout_scan: page %p is mapped", m));
2fe6e4d7SDavid Greenman		}
bb7858eaSJeff Roberson		if (act_delta != 0) {
86fa2471SAlan Cox			if (object->ref_count != 0) {
*83c9dea1SGleb Smirnoff				VM_CNT_INC(v_reactivated);
26f9a767SRodney W. Grimes				vm_page_activate(m);
960810ccSAlan Cox
960810ccSAlan Cox				/*
960810ccSAlan Cox				 * Increase the activation count if the page
960810ccSAlan Cox				 * was referenced while in the inactive queue.
960810ccSAlan Cox				 * This makes it less likely that the page will
960810ccSAlan Cox				 * be returned prematurely to the inactive
960810ccSAlan Cox				 * queue.
960810ccSAlan Cox 				 */
bb7858eaSJeff Roberson				m->act_count += act_delta + ACT_ADVANCE;
960810ccSAlan Cox				goto drop_page;
ebcddc72SAlan Cox			} else if ((object->flags & OBJ_DEAD) == 0) {
ebcddc72SAlan Cox				vm_pagequeue_lock(pq);
ebcddc72SAlan Cox				queue_locked = TRUE;
ebcddc72SAlan Cox				m->queue = PQ_INACTIVE;
ebcddc72SAlan Cox				TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
ebcddc72SAlan Cox				vm_pagequeue_cnt_inc(pq);
ebcddc72SAlan Cox				goto drop_page;
ebcddc72SAlan Cox			}
960810ccSAlan Cox		}
67bf6868SJohn Dyson
7e006499SJohn Dyson		/*
9fc4739dSAlan Cox		 * If the page appears to be clean at the machine-independent
9fc4739dSAlan Cox		 * layer, then remove all of its mappings from the pmap in
a766ffd0SAlan Cox		 * anticipation of freeing it.  If, however, any of the page's
a766ffd0SAlan Cox		 * mappings allow write access, then the page may still be
a766ffd0SAlan Cox		 * modified until the last of those mappings are removed.
7e006499SJohn Dyson		 */
aa044135SAlan Cox		if (object->ref_count != 0) {
9fc4739dSAlan Cox			vm_page_test_dirty(m);
aa044135SAlan Cox			if (m->dirty == 0)
b78ddb0bSAlan Cox				pmap_remove_all(m);
aa044135SAlan Cox		}
dcbcd518SBruce Evans
6989c456SAlan Cox		/*
ebcddc72SAlan Cox		 * Clean pages can be freed, but dirty pages must be sent back
ebcddc72SAlan Cox		 * to the laundry, unless they belong to a dead object.
ebcddc72SAlan Cox		 * Requeueing dirty pages from dead objects is pointless, as
ebcddc72SAlan Cox		 * they are being paged out and freed by the thread that
ebcddc72SAlan Cox		 * destroyed the object.
6989c456SAlan Cox		 */
ebcddc72SAlan Cox		if (m->dirty == 0) {
8748f58cSKonstantin Belousovfree_page:
78afdce6SAlan Cox			vm_page_free(m);
*83c9dea1SGleb Smirnoff			VM_CNT_INC(v_dfree);
1c7c3c6aSMatthew Dillon			--page_shortage;
ebcddc72SAlan Cox		} else if ((object->flags & OBJ_DEAD) == 0)
ebcddc72SAlan Cox			vm_page_launder(m);
776f729cSKonstantin Belousovdrop_page:
48cc2fc7SKonstantin Belousov		vm_page_unlock(m);
89f6b863SAttilio Rao		VM_OBJECT_WUNLOCK(object);
3ac8f842SMark Johnston		if (!queue_locked) {
8d220203SAlan Cox			vm_pagequeue_lock(pq);
3ac8f842SMark Johnston			queue_locked = TRUE;
6989c456SAlan Cox		}
c325e866SKonstantin Belousov		next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
c325e866SKonstantin Belousov		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
0d94caffSDavid Greenman	}
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
26f9a767SRodney W. Grimes
ebcddc72SAlan Cox	/*
ebcddc72SAlan Cox	 * Wake up the laundry thread so that it can perform any needed
ebcddc72SAlan Cox	 * laundering.  If we didn't meet our target, we're in shortfall and
b1fd102eSMark Johnston	 * need to launder more aggressively.  If PQ_LAUNDRY is empty and no
b1fd102eSMark Johnston	 * swap devices are configured, the laundry thread has no work to do, so
b1fd102eSMark Johnston	 * don't bother waking it up.
ebcddc72SAlan Cox	 */
ebcddc72SAlan Cox	if (vm_laundry_request == VM_LAUNDRY_IDLE &&
ebcddc72SAlan Cox	    starting_page_shortage > 0) {
ebcddc72SAlan Cox		pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY];
ebcddc72SAlan Cox		vm_pagequeue_lock(pq);
b1fd102eSMark Johnston		if (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled)) {
ebcddc72SAlan Cox			if (page_shortage > 0) {
ebcddc72SAlan Cox				vm_laundry_request = VM_LAUNDRY_SHORTFALL;
*83c9dea1SGleb Smirnoff				VM_CNT_INC(v_pdshortfalls);
ebcddc72SAlan Cox			} else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL)
ebcddc72SAlan Cox				vm_laundry_request = VM_LAUNDRY_BACKGROUND;
ebcddc72SAlan Cox			wakeup(&vm_laundry_request);
b1fd102eSMark Johnston		}
ebcddc72SAlan Cox		vm_pagequeue_unlock(pq);
ebcddc72SAlan Cox	}
ebcddc72SAlan Cox
9452b5edSAlan Cox#if !defined(NO_SWAPPING)
9452b5edSAlan Cox	/*
f095d1bbSAlan Cox	 * Wakeup the swapout daemon if we didn't free the targeted number of
f095d1bbSAlan Cox	 * pages.
9452b5edSAlan Cox	 */
9452b5edSAlan Cox	if (vm_swap_enabled && page_shortage > 0)
9452b5edSAlan Cox		vm_req_vmdaemon(VM_SWAP_NORMAL);
9452b5edSAlan Cox#endif
9452b5edSAlan Cox
9452b5edSAlan Cox	/*
76386c7eSKonstantin Belousov	 * If the inactive queue scan fails repeatedly to meet its
76386c7eSKonstantin Belousov	 * target, kill the largest process.
76386c7eSKonstantin Belousov	 */
76386c7eSKonstantin Belousov	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
76386c7eSKonstantin Belousov
76386c7eSKonstantin Belousov	/*
936524aaSMatthew Dillon	 * Compute the number of pages we want to try to move from the
ebcddc72SAlan Cox	 * active queue to either the inactive or laundry queue.
ebcddc72SAlan Cox	 *
ebcddc72SAlan Cox	 * When scanning active pages, we make clean pages count more heavily
ebcddc72SAlan Cox	 * towards the page shortage than dirty pages.  This is because dirty
ebcddc72SAlan Cox	 * pages must be laundered before they can be reused and thus have less
ebcddc72SAlan Cox	 * utility when attempting to quickly alleviate a shortage.  However,
ebcddc72SAlan Cox	 * this weighting also causes the scan to deactivate dirty pages more
ebcddc72SAlan Cox	 * more aggressively, improving the effectiveness of clustering and
ebcddc72SAlan Cox	 * ensuring that they can eventually be reused.
1c7c3c6aSMatthew Dillon	 */
ebcddc72SAlan Cox	inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count +
ebcddc72SAlan Cox	    vm_cnt.v_laundry_count / act_scan_laundry_weight) +
9099545aSAlan Cox	    vm_paging_target() + deficit + addl_page_shortage;
ebcddc72SAlan Cox	page_shortage *= act_scan_laundry_weight;
9099545aSAlan Cox
114f62c6SJeff Roberson	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
114f62c6SJeff Roberson	vm_pagequeue_lock(pq);
9099545aSAlan Cox	maxscan = pq->pq_cnt;
9099545aSAlan Cox
d9e23210SJeff Roberson	/*
d9e23210SJeff Roberson	 * If we're just idle polling attempt to visit every
d9e23210SJeff Roberson	 * active page within 'update_period' seconds.
d9e23210SJeff Roberson	 */
22cf98d1SAlan Cox	scan_tick = ticks;
22cf98d1SAlan Cox	if (vm_pageout_update_period != 0) {
22cf98d1SAlan Cox		min_scan = pq->pq_cnt;
22cf98d1SAlan Cox		min_scan *= scan_tick - vmd->vmd_last_active_scan;
22cf98d1SAlan Cox		min_scan /= hz * vm_pageout_update_period;
22cf98d1SAlan Cox	} else
22cf98d1SAlan Cox		min_scan = 0;
e57dd910SAlan Cox	if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
22cf98d1SAlan Cox		vmd->vmd_last_active_scan = scan_tick;
1c7c3c6aSMatthew Dillon
1c7c3c6aSMatthew Dillon	/*
22cf98d1SAlan Cox	 * Scan the active queue for pages that can be deactivated.  Update
22cf98d1SAlan Cox	 * the per-page activity counter and use it to identify deactivation
79144408SAlan Cox	 * candidates.  Held pages may be deactivated.
1c7c3c6aSMatthew Dillon	 */
22cf98d1SAlan Cox	for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
e57dd910SAlan Cox	    min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
22cf98d1SAlan Cox	    scanned++) {
9cf51988SAlan Cox		KASSERT(m->queue == PQ_ACTIVE,
d3c09dd7SAlan Cox		    ("vm_pageout_scan: page %p isn't active", m));
c325e866SKonstantin Belousov		next = TAILQ_NEXT(m, plinks.q);
22cf98d1SAlan Cox		if ((m->flags & PG_MARKER) != 0)
8dbca793STor Egge			continue;
7900f95dSKonstantin Belousov		KASSERT((m->flags & PG_FICTITIOUS) == 0,
7900f95dSKonstantin Belousov		    ("Fictitious page %p cannot be in active queue", m));
7900f95dSKonstantin Belousov		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7900f95dSKonstantin Belousov		    ("Unmanaged page %p cannot be in active queue", m));
9ee2165fSAlan Cox		if (!vm_pageout_page_lock(m, &next)) {
8c616246SKonstantin Belousov			vm_page_unlock(m);
2965a453SKip Macy			continue;
2965a453SKip Macy		}
b18bfc3dSJohn Dyson
b18bfc3dSJohn Dyson		/*
79144408SAlan Cox		 * The count for page daemon pages is updated after checking
79144408SAlan Cox		 * the page for eligibility.
b18bfc3dSJohn Dyson		 */
*83c9dea1SGleb Smirnoff		VM_CNT_INC(v_pdpages);
ef743ce6SJohn Dyson
7e006499SJohn Dyson		/*
7e006499SJohn Dyson		 * Check to see "how much" the page has been used.
7e006499SJohn Dyson		 */
86fa2471SAlan Cox		if ((m->aflags & PGA_REFERENCED) != 0) {
bb7858eaSJeff Roberson			vm_page_aflag_clear(m, PGA_REFERENCED);
86fa2471SAlan Cox			act_delta = 1;
86fa2471SAlan Cox		} else
86fa2471SAlan Cox			act_delta = 0;
86fa2471SAlan Cox
274132acSJeff Roberson		/*
79144408SAlan Cox		 * Perform an unsynchronized object ref count check.  While
79144408SAlan Cox		 * the page lock ensures that the page is not reallocated to
79144408SAlan Cox		 * another object, in particular, one with unmanaged mappings
79144408SAlan Cox		 * that cannot support pmap_ts_referenced(), two races are,
79144408SAlan Cox		 * nonetheless, possible:
79144408SAlan Cox		 * 1) The count was transitioning to zero, but we saw a non-
79144408SAlan Cox		 *    zero value.  pmap_ts_referenced() will return zero
79144408SAlan Cox		 *    because the page is not mapped.
79144408SAlan Cox		 * 2) The count was transitioning to one, but we saw zero.
79144408SAlan Cox		 *    This race delays the detection of a new reference.  At
79144408SAlan Cox		 *    worst, we will deactivate and reactivate the page.
274132acSJeff Roberson		 */
274132acSJeff Roberson		if (m->object->ref_count != 0)
bb7858eaSJeff Roberson			act_delta += pmap_ts_referenced(m);
bb7858eaSJeff Roberson
bb7858eaSJeff Roberson		/*
bb7858eaSJeff Roberson		 * Advance or decay the act_count based on recent usage.
bb7858eaSJeff Roberson		 */
86fa2471SAlan Cox		if (act_delta != 0) {
bb7858eaSJeff Roberson			m->act_count += ACT_ADVANCE + act_delta;
38efa82bSJohn Dyson			if (m->act_count > ACT_MAX)
38efa82bSJohn Dyson				m->act_count = ACT_MAX;
86fa2471SAlan Cox		} else
38efa82bSJohn Dyson			m->act_count -= min(m->act_count, ACT_DECLINE);
bb7858eaSJeff Roberson
bb7858eaSJeff Roberson		/*
ebcddc72SAlan Cox		 * Move this page to the tail of the active, inactive or laundry
bb7858eaSJeff Roberson		 * queue depending on usage.
bb7858eaSJeff Roberson		 */
86fa2471SAlan Cox		if (m->act_count == 0) {
8d220203SAlan Cox			/* Dequeue to avoid later lock recursion. */
8d220203SAlan Cox			vm_page_dequeue_locked(m);
ebcddc72SAlan Cox
ebcddc72SAlan Cox			/*
ebcddc72SAlan Cox			 * When not short for inactive pages, let dirty pages go
ebcddc72SAlan Cox			 * through the inactive queue before moving to the
ebcddc72SAlan Cox			 * laundry queues.  This gives them some extra time to
ebcddc72SAlan Cox			 * be reactivated, potentially avoiding an expensive
ebcddc72SAlan Cox			 * pageout.  During a page shortage, the inactive queue
ebcddc72SAlan Cox			 * is necessarily small, so we may move dirty pages
ebcddc72SAlan Cox			 * directly to the laundry queue.
ebcddc72SAlan Cox			 */
ebcddc72SAlan Cox			if (inactq_shortage <= 0)
d4a272dbSJohn Dyson				vm_page_deactivate(m);
ebcddc72SAlan Cox			else {
ebcddc72SAlan Cox				/*
ebcddc72SAlan Cox				 * Calling vm_page_test_dirty() here would
ebcddc72SAlan Cox				 * require acquisition of the object's write
ebcddc72SAlan Cox				 * lock.  However, during a page shortage,
ebcddc72SAlan Cox				 * directing dirty pages into the laundry
ebcddc72SAlan Cox				 * queue is only an optimization and not a
ebcddc72SAlan Cox				 * requirement.  Therefore, we simply rely on
ebcddc72SAlan Cox				 * the opportunistic updates to the page's
ebcddc72SAlan Cox				 * dirty field by the pmap.
ebcddc72SAlan Cox				 */
ebcddc72SAlan Cox				if (m->dirty == 0) {
ebcddc72SAlan Cox					vm_page_deactivate(m);
ebcddc72SAlan Cox					inactq_shortage -=
ebcddc72SAlan Cox					    act_scan_laundry_weight;
ebcddc72SAlan Cox				} else {
ebcddc72SAlan Cox					vm_page_launder(m);
e57dd910SAlan Cox					inactq_shortage--;
ebcddc72SAlan Cox				}
ebcddc72SAlan Cox			}
8d220203SAlan Cox		} else
8d220203SAlan Cox			vm_page_requeue_locked(m);
2965a453SKip Macy		vm_page_unlock(m);
26f9a767SRodney W. Grimes	}
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
ceb0cf87SJohn Dyson#if !defined(NO_SWAPPING)
ceb0cf87SJohn Dyson	/*
87ff568cSAlan Cox	 * Idle process swapout -- run once per second when we are reclaiming
87ff568cSAlan Cox	 * pages.
ceb0cf87SJohn Dyson	 */
87ff568cSAlan Cox	if (vm_swap_idle_enabled && pass > 0) {
ceb0cf87SJohn Dyson		static long lsec;
227ee8a1SPoul-Henning Kamp		if (time_second != lsec) {
97824da3SAlan Cox			vm_req_vmdaemon(VM_SWAP_IDLE);
227ee8a1SPoul-Henning Kamp			lsec = time_second;
ceb0cf87SJohn Dyson		}
ceb0cf87SJohn Dyson	}
ceb0cf87SJohn Dyson#endif
e57dd910SAlan Cox	return (page_shortage <= 0);
2025d69bSKonstantin Belousov}
2025d69bSKonstantin Belousov
449c2e92SKonstantin Belousovstatic int vm_pageout_oom_vote;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov/*
449c2e92SKonstantin Belousov * The pagedaemon threads randlomly select one to perform the
449c2e92SKonstantin Belousov * OOM.  Trying to kill processes before all pagedaemons
449c2e92SKonstantin Belousov * failed to reach free target is premature.
449c2e92SKonstantin Belousov */
449c2e92SKonstantin Belousovstatic void
76386c7eSKonstantin Belousovvm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
76386c7eSKonstantin Belousov    int starting_page_shortage)
449c2e92SKonstantin Belousov{
449c2e92SKonstantin Belousov	int old_vote;
449c2e92SKonstantin Belousov
76386c7eSKonstantin Belousov	if (starting_page_shortage <= 0 || starting_page_shortage !=
76386c7eSKonstantin Belousov	    page_shortage)
76386c7eSKonstantin Belousov		vmd->vmd_oom_seq = 0;
76386c7eSKonstantin Belousov	else
76386c7eSKonstantin Belousov		vmd->vmd_oom_seq++;
76386c7eSKonstantin Belousov	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
449c2e92SKonstantin Belousov		if (vmd->vmd_oom) {
449c2e92SKonstantin Belousov			vmd->vmd_oom = FALSE;
449c2e92SKonstantin Belousov			atomic_subtract_int(&vm_pageout_oom_vote, 1);
449c2e92SKonstantin Belousov		}
449c2e92SKonstantin Belousov		return;
449c2e92SKonstantin Belousov	}
449c2e92SKonstantin Belousov
76386c7eSKonstantin Belousov	/*
76386c7eSKonstantin Belousov	 * Do not follow the call sequence until OOM condition is
76386c7eSKonstantin Belousov	 * cleared.
76386c7eSKonstantin Belousov	 */
76386c7eSKonstantin Belousov	vmd->vmd_oom_seq = 0;
76386c7eSKonstantin Belousov
449c2e92SKonstantin Belousov	if (vmd->vmd_oom)
449c2e92SKonstantin Belousov		return;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	vmd->vmd_oom = TRUE;
449c2e92SKonstantin Belousov	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
449c2e92SKonstantin Belousov	if (old_vote != vm_ndomains - 1)
449c2e92SKonstantin Belousov		return;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
449c2e92SKonstantin Belousov	 * The current pagedaemon thread is the last in the quorum to
449c2e92SKonstantin Belousov	 * start OOM.  Initiate the selection and signaling of the
449c2e92SKonstantin Belousov	 * victim.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov	vm_pageout_oom(VM_OOM_MEM);
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
449c2e92SKonstantin Belousov	 * After one round of OOM terror, recall our vote.  On the
449c2e92SKonstantin Belousov	 * next pass, current pagedaemon would vote again if the low
449c2e92SKonstantin Belousov	 * memory condition is still there, due to vmd_oom being
449c2e92SKonstantin Belousov	 * false.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov	vmd->vmd_oom = FALSE;
449c2e92SKonstantin Belousov	atomic_subtract_int(&vm_pageout_oom_vote, 1);
449c2e92SKonstantin Belousov}
2025d69bSKonstantin Belousov
3949873fSKonstantin Belousov/*
3949873fSKonstantin Belousov * The OOM killer is the page daemon's action of last resort when
3949873fSKonstantin Belousov * memory allocation requests have been stalled for a prolonged period
3949873fSKonstantin Belousov * of time because it cannot reclaim memory.  This function computes
3949873fSKonstantin Belousov * the approximate number of physical pages that could be reclaimed if
3949873fSKonstantin Belousov * the specified address space is destroyed.
3949873fSKonstantin Belousov *
3949873fSKonstantin Belousov * Private, anonymous memory owned by the address space is the
3949873fSKonstantin Belousov * principal resource that we expect to recover after an OOM kill.
3949873fSKonstantin Belousov * Since the physical pages mapped by the address space's COW entries
3949873fSKonstantin Belousov * are typically shared pages, they are unlikely to be released and so
3949873fSKonstantin Belousov * they are not counted.
3949873fSKonstantin Belousov *
3949873fSKonstantin Belousov * To get to the point where the page daemon runs the OOM killer, its
3949873fSKonstantin Belousov * efforts to write-back vnode-backed pages may have stalled.  This
3949873fSKonstantin Belousov * could be caused by a memory allocation deadlock in the write path
3949873fSKonstantin Belousov * that might be resolved by an OOM kill.  Therefore, physical pages
3949873fSKonstantin Belousov * belonging to vnode-backed objects are counted, because they might
3949873fSKonstantin Belousov * be freed without being written out first if the address space holds
3949873fSKonstantin Belousov * the last reference to an unlinked vnode.
3949873fSKonstantin Belousov *
3949873fSKonstantin Belousov * Similarly, physical pages belonging to OBJT_PHYS objects are
3949873fSKonstantin Belousov * counted because the address space might hold the last reference to
3949873fSKonstantin Belousov * the object.
3949873fSKonstantin Belousov */
3949873fSKonstantin Belousovstatic long
3949873fSKonstantin Belousovvm_pageout_oom_pagecount(struct vmspace *vmspace)
3949873fSKonstantin Belousov{
3949873fSKonstantin Belousov	vm_map_t map;
3949873fSKonstantin Belousov	vm_map_entry_t entry;
3949873fSKonstantin Belousov	vm_object_t obj;
3949873fSKonstantin Belousov	long res;
3949873fSKonstantin Belousov
3949873fSKonstantin Belousov	map = &vmspace->vm_map;
3949873fSKonstantin Belousov	KASSERT(!map->system_map, ("system map"));
3949873fSKonstantin Belousov	sx_assert(&map->lock, SA_LOCKED);
3949873fSKonstantin Belousov	res = 0;
3949873fSKonstantin Belousov	for (entry = map->header.next; entry != &map->header;
3949873fSKonstantin Belousov	    entry = entry->next) {
3949873fSKonstantin Belousov		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3949873fSKonstantin Belousov			continue;
3949873fSKonstantin Belousov		obj = entry->object.vm_object;
3949873fSKonstantin Belousov		if (obj == NULL)
3949873fSKonstantin Belousov			continue;
3949873fSKonstantin Belousov		if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
3949873fSKonstantin Belousov		    obj->ref_count != 1)
3949873fSKonstantin Belousov			continue;
3949873fSKonstantin Belousov		switch (obj->type) {
3949873fSKonstantin Belousov		case OBJT_DEFAULT:
3949873fSKonstantin Belousov		case OBJT_SWAP:
3949873fSKonstantin Belousov		case OBJT_PHYS:
3949873fSKonstantin Belousov		case OBJT_VNODE:
3949873fSKonstantin Belousov			res += obj->resident_page_count;
3949873fSKonstantin Belousov			break;
3949873fSKonstantin Belousov		}
3949873fSKonstantin Belousov	}
3949873fSKonstantin Belousov	return (res);
3949873fSKonstantin Belousov}
3949873fSKonstantin Belousov
2025d69bSKonstantin Belousovvoid
2025d69bSKonstantin Belousovvm_pageout_oom(int shortage)
2025d69bSKonstantin Belousov{
2025d69bSKonstantin Belousov	struct proc *p, *bigproc;
2025d69bSKonstantin Belousov	vm_offset_t size, bigsize;
2025d69bSKonstantin Belousov	struct thread *td;
6bed074cSKonstantin Belousov	struct vmspace *vm;
2025d69bSKonstantin Belousov
2025d69bSKonstantin Belousov	/*
1c58e4e5SJohn Baldwin	 * We keep the process bigproc locked once we find it to keep anyone
1c58e4e5SJohn Baldwin	 * from messing with it; however, there is a possibility of
28323addSBryan Drewery	 * deadlock if process B is bigproc and one of its child processes
1c58e4e5SJohn Baldwin	 * attempts to propagate a signal to B while we are waiting for A's
1c58e4e5SJohn Baldwin	 * lock while walking this list.  To avoid this, we don't block on
1c58e4e5SJohn Baldwin	 * the process lock but just skip a process if it is already locked.
5663e6deSDavid Greenman	 */
5663e6deSDavid Greenman	bigproc = NULL;
5663e6deSDavid Greenman	bigsize = 0;
1005a129SJohn Baldwin	sx_slock(&allproc_lock);
e602ba25SJulian Elischer	FOREACH_PROC_IN_SYSTEM(p) {
e602ba25SJulian Elischer		int breakout;
dcbcd518SBruce Evans
71943c3dSKonstantin Belousov		PROC_LOCK(p);
71943c3dSKonstantin Belousov
1c58e4e5SJohn Baldwin		/*
3f1c4c4fSKonstantin Belousov		 * If this is a system, protected or killed process, skip it.
5663e6deSDavid Greenman		 */
71943c3dSKonstantin Belousov		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
71943c3dSKonstantin Belousov		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
71943c3dSKonstantin Belousov		    p->p_pid == 1 || P_KILLED(p) ||
71943c3dSKonstantin Belousov		    (p->p_pid < 48 && swap_pager_avail != 0)) {
8606d880SJohn Baldwin			PROC_UNLOCK(p);
5663e6deSDavid Greenman			continue;
5663e6deSDavid Greenman		}
5663e6deSDavid Greenman		/*
dcbcd518SBruce Evans		 * If the process is in a non-running type state,
e602ba25SJulian Elischer		 * don't touch it.  Check all the threads individually.
5663e6deSDavid Greenman		 */
e602ba25SJulian Elischer		breakout = 0;
e602ba25SJulian Elischer		FOREACH_THREAD_IN_PROC(p, td) {
982d11f8SJeff Roberson			thread_lock(td);
71fad9fdSJulian Elischer			if (!TD_ON_RUNQ(td) &&
71fad9fdSJulian Elischer			    !TD_IS_RUNNING(td) &&
f497cda2SEdward Tomasz Napierala			    !TD_IS_SLEEPING(td) &&
b98acc0aSKonstantin Belousov			    !TD_IS_SUSPENDED(td) &&
b98acc0aSKonstantin Belousov			    !TD_IS_SWAPPED(td)) {
982d11f8SJeff Roberson				thread_unlock(td);
e602ba25SJulian Elischer				breakout = 1;
e602ba25SJulian Elischer				break;
e602ba25SJulian Elischer			}
982d11f8SJeff Roberson			thread_unlock(td);
e602ba25SJulian Elischer		}
e602ba25SJulian Elischer		if (breakout) {
1c58e4e5SJohn Baldwin			PROC_UNLOCK(p);
5663e6deSDavid Greenman			continue;
5663e6deSDavid Greenman		}
5663e6deSDavid Greenman		/*
5663e6deSDavid Greenman		 * get the process size
5663e6deSDavid Greenman		 */
6bed074cSKonstantin Belousov		vm = vmspace_acquire_ref(p);
6bed074cSKonstantin Belousov		if (vm == NULL) {
6bed074cSKonstantin Belousov			PROC_UNLOCK(p);
6bed074cSKonstantin Belousov			continue;
6bed074cSKonstantin Belousov		}
95e2409aSKonstantin Belousov		_PHOLD_LITE(p);
72d97679SDavid Schultz		PROC_UNLOCK(p);
95e2409aSKonstantin Belousov		sx_sunlock(&allproc_lock);
95e2409aSKonstantin Belousov		if (!vm_map_trylock_read(&vm->vm_map)) {
71943c3dSKonstantin Belousov			vmspace_free(vm);
95e2409aSKonstantin Belousov			sx_slock(&allproc_lock);
95e2409aSKonstantin Belousov			PRELE(p);
72d97679SDavid Schultz			continue;
72d97679SDavid Schultz		}
7981aa24SKonstantin Belousov		size = vmspace_swap_count(vm);
2025d69bSKonstantin Belousov		if (shortage == VM_OOM_MEM)
3949873fSKonstantin Belousov			size += vm_pageout_oom_pagecount(vm);
3949873fSKonstantin Belousov		vm_map_unlock_read(&vm->vm_map);
6bed074cSKonstantin Belousov		vmspace_free(vm);
95e2409aSKonstantin Belousov		sx_slock(&allproc_lock);
3949873fSKonstantin Belousov
5663e6deSDavid Greenman		/*
3949873fSKonstantin Belousov		 * If this process is bigger than the biggest one,
5663e6deSDavid Greenman		 * remember it.
5663e6deSDavid Greenman		 */
5663e6deSDavid Greenman		if (size > bigsize) {
1c58e4e5SJohn Baldwin			if (bigproc != NULL)
71943c3dSKonstantin Belousov				PRELE(bigproc);
5663e6deSDavid Greenman			bigproc = p;
5663e6deSDavid Greenman			bigsize = size;
71943c3dSKonstantin Belousov		} else {
71943c3dSKonstantin Belousov			PRELE(p);
71943c3dSKonstantin Belousov		}
5663e6deSDavid Greenman	}
1005a129SJohn Baldwin	sx_sunlock(&allproc_lock);
5663e6deSDavid Greenman	if (bigproc != NULL) {
8311a2b8SWill Andrews		if (vm_panic_on_oom != 0)
8311a2b8SWill Andrews			panic("out of swap space");
71943c3dSKonstantin Belousov		PROC_LOCK(bigproc);
729b1e51SDavid Greenman		killproc(bigproc, "out of swap space");
fa885116SJulian Elischer		sched_nice(bigproc, PRIO_MIN);
71943c3dSKonstantin Belousov		_PRELE(bigproc);
1c58e4e5SJohn Baldwin		PROC_UNLOCK(bigproc);
44f1c916SBryan Drewery		wakeup(&vm_cnt.v_free_count);
5663e6deSDavid Greenman	}
5663e6deSDavid Greenman}
26f9a767SRodney W. Grimes
449c2e92SKonstantin Belousovstatic void
449c2e92SKonstantin Belousovvm_pageout_worker(void *arg)
449c2e92SKonstantin Belousov{
449c2e92SKonstantin Belousov	struct vm_domain *domain;
70cf3cedSAlan Cox	int domidx, pass;
e57dd910SAlan Cox	bool target_met;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	domidx = (uintptr_t)arg;
449c2e92SKonstantin Belousov	domain = &vm_dom[domidx];
70cf3cedSAlan Cox	pass = 0;
e57dd910SAlan Cox	target_met = true;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
949c9186SKonstantin Belousov	 * XXXKIB It could be useful to bind pageout daemon threads to
949c9186SKonstantin Belousov	 * the cores belonging to the domain, from which vm_page_array
949c9186SKonstantin Belousov	 * is allocated.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
22cf98d1SAlan Cox	domain->vmd_last_active_scan = ticks;
449c2e92SKonstantin Belousov	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
7e78597fSMark Johnston	vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE);
7e78597fSMark Johnston	TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl,
7e78597fSMark Johnston	    &domain->vmd_inacthead, plinks.q);
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
449c2e92SKonstantin Belousov	 * The pageout daemon worker is never done, so loop forever.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov	while (TRUE) {
449c2e92SKonstantin Belousov		mtx_lock(&vm_page_queue_free_mtx);
56ce0690SAlan Cox
56ce0690SAlan Cox		/*
56ce0690SAlan Cox		 * Generally, after a level >= 1 scan, if there are enough
56ce0690SAlan Cox		 * free pages to wakeup the waiters, then they are already
56ce0690SAlan Cox		 * awake.  A call to vm_page_free() during the scan awakened
56ce0690SAlan Cox		 * them.  However, in the following case, this wakeup serves
56ce0690SAlan Cox		 * to bound the amount of time that a thread might wait.
56ce0690SAlan Cox		 * Suppose a thread's call to vm_page_alloc() fails, but
56ce0690SAlan Cox		 * before that thread calls VM_WAIT, enough pages are freed by
56ce0690SAlan Cox		 * other threads to alleviate the free page shortage.  The
56ce0690SAlan Cox		 * thread will, nonetheless, wait until another page is freed
56ce0690SAlan Cox		 * or this wakeup is performed.
56ce0690SAlan Cox		 */
449c2e92SKonstantin Belousov		if (vm_pages_needed && !vm_page_count_min()) {
56ce0690SAlan Cox			vm_pages_needed = false;
44f1c916SBryan Drewery			wakeup(&vm_cnt.v_free_count);
449c2e92SKonstantin Belousov		}
56ce0690SAlan Cox
449c2e92SKonstantin Belousov		/*
e57dd910SAlan Cox		 * Do not clear vm_pageout_wanted until we reach our free page
e57dd910SAlan Cox		 * target.  Otherwise, we may be awakened over and over again,
e57dd910SAlan Cox		 * wasting CPU time.
449c2e92SKonstantin Belousov		 */
e57dd910SAlan Cox		if (vm_pageout_wanted && target_met)
56ce0690SAlan Cox			vm_pageout_wanted = false;
56ce0690SAlan Cox
56ce0690SAlan Cox		/*
56ce0690SAlan Cox		 * Might the page daemon receive a wakeup call?
56ce0690SAlan Cox		 */
56ce0690SAlan Cox		if (vm_pageout_wanted) {
56ce0690SAlan Cox			/*
56ce0690SAlan Cox			 * No.  Either vm_pageout_wanted was set by another
56ce0690SAlan Cox			 * thread during the previous scan, which must have
56ce0690SAlan Cox			 * been a level 0 scan, or vm_pageout_wanted was
56ce0690SAlan Cox			 * already set and the scan failed to free enough
ebcddc72SAlan Cox			 * pages.  If we haven't yet performed a level >= 1
ebcddc72SAlan Cox			 * (page reclamation) scan, then increase the level
ebcddc72SAlan Cox			 * and scan again now.  Otherwise, sleep a bit and
ebcddc72SAlan Cox			 * try again later.
56ce0690SAlan Cox			 */
56ce0690SAlan Cox			mtx_unlock(&vm_page_queue_free_mtx);
ebcddc72SAlan Cox			if (pass >= 1)
ebcddc72SAlan Cox				pause("psleep", hz / VM_INACT_SCAN_RATE);
70cf3cedSAlan Cox			pass++;
449c2e92SKonstantin Belousov		} else {
449c2e92SKonstantin Belousov			/*
56ce0690SAlan Cox			 * Yes.  Sleep until pages need to be reclaimed or
56ce0690SAlan Cox			 * have their reference stats updated.
449c2e92SKonstantin Belousov			 */
56ce0690SAlan Cox			if (mtx_sleep(&vm_pageout_wanted,
56ce0690SAlan Cox			    &vm_page_queue_free_mtx, PDROP | PVM, "psleep",
56ce0690SAlan Cox			    hz) == 0) {
*83c9dea1SGleb Smirnoff				VM_CNT_INC(v_pdwakeups);
70cf3cedSAlan Cox				pass = 1;
d9347bcaSAlan Cox			} else
70cf3cedSAlan Cox				pass = 0;
56ce0690SAlan Cox		}
56ce0690SAlan Cox
70cf3cedSAlan Cox		target_met = vm_pageout_scan(domain, pass);
449c2e92SKonstantin Belousov	}
449c2e92SKonstantin Belousov}
449c2e92SKonstantin Belousov
df8bae1dSRodney W. Grimes/*
4d19f4adSSteven Hartland *	vm_pageout_init initialises basic pageout daemon settings.
df8bae1dSRodney W. Grimes */
2b14f991SJulian Elischerstatic void
4d19f4adSSteven Hartlandvm_pageout_init(void)
df8bae1dSRodney W. Grimes{
df8bae1dSRodney W. Grimes	/*
df8bae1dSRodney W. Grimes	 * Initialize some paging parameters.
df8bae1dSRodney W. Grimes	 */
44f1c916SBryan Drewery	vm_cnt.v_interrupt_free_min = 2;
44f1c916SBryan Drewery	if (vm_cnt.v_page_count < 2000)
f35329acSJohn Dyson		vm_pageout_page_count = 8;
f6b04d2bSDavid Greenman
45ae1d91SAlan Cox	/*
45ae1d91SAlan Cox	 * v_free_reserved needs to include enough for the largest
45ae1d91SAlan Cox	 * swap pager structures plus enough for any pv_entry structs
45ae1d91SAlan Cox	 * when paging.
45ae1d91SAlan Cox	 */
44f1c916SBryan Drewery	if (vm_cnt.v_page_count > 1024)
44f1c916SBryan Drewery		vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
2feb50bfSAttilio Rao	else
44f1c916SBryan Drewery		vm_cnt.v_free_min = 4;
44f1c916SBryan Drewery	vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
44f1c916SBryan Drewery	    vm_cnt.v_interrupt_free_min;
44f1c916SBryan Drewery	vm_cnt.v_free_reserved = vm_pageout_page_count +
44f1c916SBryan Drewery	    vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
44f1c916SBryan Drewery	vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
44f1c916SBryan Drewery	vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
44f1c916SBryan Drewery	vm_cnt.v_free_min += vm_cnt.v_free_reserved;
44f1c916SBryan Drewery	vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
44f1c916SBryan Drewery	vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
44f1c916SBryan Drewery	if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
44f1c916SBryan Drewery		vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
df8bae1dSRodney W. Grimes
d9e23210SJeff Roberson	/*
d9e23210SJeff Roberson	 * Set the default wakeup threshold to be 10% above the minimum
d9e23210SJeff Roberson	 * page limit.  This keeps the steady state out of shortfall.
d9e23210SJeff Roberson	 */
44f1c916SBryan Drewery	vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
d9e23210SJeff Roberson
d9e23210SJeff Roberson	/*
d9e23210SJeff Roberson	 * Set interval in seconds for active scan.  We want to visit each
c9612b2dSJeff Roberson	 * page at least once every ten minutes.  This is to prevent worst
c9612b2dSJeff Roberson	 * case paging behaviors with stale active LRU.
d9e23210SJeff Roberson	 */
d9e23210SJeff Roberson	if (vm_pageout_update_period == 0)
c9612b2dSJeff Roberson		vm_pageout_update_period = 600;
d9e23210SJeff Roberson
df8bae1dSRodney W. Grimes	/* XXX does not really belong here */
df8bae1dSRodney W. Grimes	if (vm_page_max_wired == 0)
44f1c916SBryan Drewery		vm_page_max_wired = vm_cnt.v_free_count / 3;
ebcddc72SAlan Cox
ebcddc72SAlan Cox	/*
ebcddc72SAlan Cox	 * Target amount of memory to move out of the laundry queue during a
ebcddc72SAlan Cox	 * background laundering.  This is proportional to the amount of system
ebcddc72SAlan Cox	 * memory.
ebcddc72SAlan Cox	 */
ebcddc72SAlan Cox	vm_background_launder_target = (vm_cnt.v_free_target -
ebcddc72SAlan Cox	    vm_cnt.v_free_min) / 10;
4d19f4adSSteven Hartland}
4d19f4adSSteven Hartland
4d19f4adSSteven Hartland/*
4d19f4adSSteven Hartland *     vm_pageout is the high level pageout daemon.
4d19f4adSSteven Hartland */
4d19f4adSSteven Hartlandstatic void
4d19f4adSSteven Hartlandvm_pageout(void)
4d19f4adSSteven Hartland{
44ec2b63SKonstantin Belousov	int error;
62d70a81SJohn Baldwin#ifdef VM_NUMA_ALLOC
44ec2b63SKonstantin Belousov	int i;
4d19f4adSSteven Hartland#endif
df8bae1dSRodney W. Grimes
24a1cce3SDavid Greenman	swap_pager_swap_init();
ebcddc72SAlan Cox	error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
ebcddc72SAlan Cox	    0, 0, "laundry: dom0");
ebcddc72SAlan Cox	if (error != 0)
ebcddc72SAlan Cox		panic("starting laundry for domain 0, error %d", error);
62d70a81SJohn Baldwin#ifdef VM_NUMA_ALLOC
449c2e92SKonstantin Belousov	for (i = 1; i < vm_ndomains; i++) {
449c2e92SKonstantin Belousov		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
449c2e92SKonstantin Belousov		    curproc, NULL, 0, 0, "dom%d", i);
449c2e92SKonstantin Belousov		if (error != 0) {
449c2e92SKonstantin Belousov			panic("starting pageout for domain %d, error %d\n",
449c2e92SKonstantin Belousov			    i, error);
dc2efb27SJohn Dyson		}
f919ebdeSDavid Greenman	}
449c2e92SKonstantin Belousov#endif
44ec2b63SKonstantin Belousov	error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
44ec2b63SKonstantin Belousov	    0, 0, "uma");
44ec2b63SKonstantin Belousov	if (error != 0)
44ec2b63SKonstantin Belousov		panic("starting uma_reclaim helper, error %d\n", error);
d395270dSDimitry Andric	vm_pageout_worker((void *)(uintptr_t)0);
df8bae1dSRodney W. Grimes}
26f9a767SRodney W. Grimes
6b4b77adSAlan Cox/*
e9f995d8SAlan Cox * Unless the free page queue lock is held by the caller, this function
6b4b77adSAlan Cox * should be regarded as advisory.  Specifically, the caller should
44f1c916SBryan Drewery * not msleep() on &vm_cnt.v_free_count following this function unless
e9f995d8SAlan Cox * the free page queue lock is held until the msleep() is performed.
6b4b77adSAlan Cox */
e0c5a895SJohn Dysonvoid
4a365329SAndrey Zonovpagedaemon_wakeup(void)
e0c5a895SJohn Dyson{
a1c0a785SAlan Cox
56ce0690SAlan Cox	if (!vm_pageout_wanted && curthread->td_proc != pageproc) {
56ce0690SAlan Cox		vm_pageout_wanted = true;
56ce0690SAlan Cox		wakeup(&vm_pageout_wanted);
e0c5a895SJohn Dyson	}
e0c5a895SJohn Dyson}
e0c5a895SJohn Dyson
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
5afce282SDavid Greenmanstatic void
97824da3SAlan Coxvm_req_vmdaemon(int req)
5afce282SDavid Greenman{
5afce282SDavid Greenman	static int lastrun = 0;
5afce282SDavid Greenman
97824da3SAlan Cox	mtx_lock(&vm_daemon_mtx);
97824da3SAlan Cox	vm_pageout_req_swapout |= req;
b18bfc3dSJohn Dyson	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
5afce282SDavid Greenman		wakeup(&vm_daemon_needed);
5afce282SDavid Greenman		lastrun = ticks;
5afce282SDavid Greenman	}
97824da3SAlan Cox	mtx_unlock(&vm_daemon_mtx);
5afce282SDavid Greenman}
5afce282SDavid Greenman
2b14f991SJulian Elischerstatic void
4a365329SAndrey Zonovvm_daemon(void)
0d94caffSDavid Greenman{
91d5354aSJohn Baldwin	struct rlimit rsslim;
dcbcd518SBruce Evans	struct proc *p;
dcbcd518SBruce Evans	struct thread *td;
6bed074cSKonstantin Belousov	struct vmspace *vm;
099e7e95SEdward Tomasz Napierala	int breakout, swapout_flags, tryagain, attempts;
afcc55f3SEdward Tomasz Napierala#ifdef RACCT
099e7e95SEdward Tomasz Napierala	uint64_t rsize, ravailable;
afcc55f3SEdward Tomasz Napierala#endif
0d94caffSDavid Greenman
2fe6e4d7SDavid Greenman	while (TRUE) {
97824da3SAlan Cox		mtx_lock(&vm_daemon_mtx);
4b5c9cf6SEdward Tomasz Napierala		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
099e7e95SEdward Tomasz Napierala#ifdef RACCT
4b5c9cf6SEdward Tomasz Napierala		    racct_enable ? hz : 0
099e7e95SEdward Tomasz Napierala#else
4b5c9cf6SEdward Tomasz Napierala		    0
099e7e95SEdward Tomasz Napierala#endif
4b5c9cf6SEdward Tomasz Napierala		);
97824da3SAlan Cox		swapout_flags = vm_pageout_req_swapout;
4c1f8ee9SDavid Greenman		vm_pageout_req_swapout = 0;
97824da3SAlan Cox		mtx_unlock(&vm_daemon_mtx);
97824da3SAlan Cox		if (swapout_flags)
97824da3SAlan Cox			swapout_procs(swapout_flags);
97824da3SAlan Cox
2fe6e4d7SDavid Greenman		/*
0d94caffSDavid Greenman		 * scan the processes for exceeding their rlimits or if
0d94caffSDavid Greenman		 * process is swapped out -- deactivate pages
2fe6e4d7SDavid Greenman		 */
099e7e95SEdward Tomasz Napierala		tryagain = 0;
099e7e95SEdward Tomasz Napierala		attempts = 0;
099e7e95SEdward Tomasz Napieralaagain:
099e7e95SEdward Tomasz Napierala		attempts++;
1005a129SJohn Baldwin		sx_slock(&allproc_lock);
f67af5c9SXin LI		FOREACH_PROC_IN_SYSTEM(p) {
fe2144fdSLuoqi Chen			vm_pindex_t limit, size;
2fe6e4d7SDavid Greenman
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * if this is a system process or if we have already
2fe6e4d7SDavid Greenman			 * looked at this process, skip it.
2fe6e4d7SDavid Greenman			 */
897ecacdSJohn Baldwin			PROC_LOCK(p);
8e6fa660SJohn Baldwin			if (p->p_state != PRS_NORMAL ||
8e6fa660SJohn Baldwin			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
897ecacdSJohn Baldwin				PROC_UNLOCK(p);
2fe6e4d7SDavid Greenman				continue;
2fe6e4d7SDavid Greenman			}
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * if the process is in a non-running type state,
2fe6e4d7SDavid Greenman			 * don't touch it.
2fe6e4d7SDavid Greenman			 */
e602ba25SJulian Elischer			breakout = 0;
e602ba25SJulian Elischer			FOREACH_THREAD_IN_PROC(p, td) {
982d11f8SJeff Roberson				thread_lock(td);
71fad9fdSJulian Elischer				if (!TD_ON_RUNQ(td) &&
71fad9fdSJulian Elischer				    !TD_IS_RUNNING(td) &&
f497cda2SEdward Tomasz Napierala				    !TD_IS_SLEEPING(td) &&
f497cda2SEdward Tomasz Napierala				    !TD_IS_SUSPENDED(td)) {
982d11f8SJeff Roberson					thread_unlock(td);
e602ba25SJulian Elischer					breakout = 1;
e602ba25SJulian Elischer					break;
e602ba25SJulian Elischer				}
982d11f8SJeff Roberson				thread_unlock(td);
e602ba25SJulian Elischer			}
897ecacdSJohn Baldwin			if (breakout) {
897ecacdSJohn Baldwin				PROC_UNLOCK(p);
2fe6e4d7SDavid Greenman				continue;
2fe6e4d7SDavid Greenman			}
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * get a limit
2fe6e4d7SDavid Greenman			 */
f6f6d240SMateusz Guzik			lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
fe2144fdSLuoqi Chen			limit = OFF_TO_IDX(
91d5354aSJohn Baldwin			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
2fe6e4d7SDavid Greenman
2fe6e4d7SDavid Greenman			/*
0d94caffSDavid Greenman			 * let processes that are swapped out really be
0d94caffSDavid Greenman			 * swapped out set the limit to nothing (will force a
0d94caffSDavid Greenman			 * swap-out.)
2fe6e4d7SDavid Greenman			 */
b61ce5b0SJeff Roberson			if ((p->p_flag & P_INMEM) == 0)
0d94caffSDavid Greenman				limit = 0;	/* XXX */
6bed074cSKonstantin Belousov			vm = vmspace_acquire_ref(p);
95e2409aSKonstantin Belousov			_PHOLD_LITE(p);
897ecacdSJohn Baldwin			PROC_UNLOCK(p);
95e2409aSKonstantin Belousov			if (vm == NULL) {
95e2409aSKonstantin Belousov				PRELE(p);
6bed074cSKonstantin Belousov				continue;
95e2409aSKonstantin Belousov			}
95e2409aSKonstantin Belousov			sx_sunlock(&allproc_lock);
2fe6e4d7SDavid Greenman
6bed074cSKonstantin Belousov			size = vmspace_resident_count(vm);
a406d8c3SEdward Tomasz Napierala			if (size >= limit) {
fe2144fdSLuoqi Chen				vm_pageout_map_deactivate_pages(
6bed074cSKonstantin Belousov				    &vm->vm_map, limit);
937c1b07SAndriy Gapon				size = vmspace_resident_count(vm);
2fe6e4d7SDavid Greenman			}
afcc55f3SEdward Tomasz Napierala#ifdef RACCT
4b5c9cf6SEdward Tomasz Napierala			if (racct_enable) {
099e7e95SEdward Tomasz Napierala				rsize = IDX_TO_OFF(size);
099e7e95SEdward Tomasz Napierala				PROC_LOCK(p);
937c1b07SAndriy Gapon				if (p->p_state == PRS_NORMAL)
099e7e95SEdward Tomasz Napierala					racct_set(p, RACCT_RSS, rsize);
099e7e95SEdward Tomasz Napierala				ravailable = racct_get_available(p, RACCT_RSS);
099e7e95SEdward Tomasz Napierala				PROC_UNLOCK(p);
099e7e95SEdward Tomasz Napierala				if (rsize > ravailable) {
099e7e95SEdward Tomasz Napierala					/*
4b5c9cf6SEdward Tomasz Napierala					 * Don't be overly aggressive; this
4b5c9cf6SEdward Tomasz Napierala					 * might be an innocent process,
4b5c9cf6SEdward Tomasz Napierala					 * and the limit could've been exceeded
4b5c9cf6SEdward Tomasz Napierala					 * by some memory hog.  Don't try
4b5c9cf6SEdward Tomasz Napierala					 * to deactivate more than 1/4th
4b5c9cf6SEdward Tomasz Napierala					 * of process' resident set size.
099e7e95SEdward Tomasz Napierala					 */
099e7e95SEdward Tomasz Napierala					if (attempts <= 8) {
4b5c9cf6SEdward Tomasz Napierala						if (ravailable < rsize -
4b5c9cf6SEdward Tomasz Napierala						    (rsize / 4)) {
4b5c9cf6SEdward Tomasz Napierala							ravailable = rsize -
4b5c9cf6SEdward Tomasz Napierala							    (rsize / 4);
4b5c9cf6SEdward Tomasz Napierala						}
099e7e95SEdward Tomasz Napierala					}
099e7e95SEdward Tomasz Napierala					vm_pageout_map_deactivate_pages(
4b5c9cf6SEdward Tomasz Napierala					    &vm->vm_map,
4b5c9cf6SEdward Tomasz Napierala					    OFF_TO_IDX(ravailable));
099e7e95SEdward Tomasz Napierala					/* Update RSS usage after paging out. */
099e7e95SEdward Tomasz Napierala					size = vmspace_resident_count(vm);
099e7e95SEdward Tomasz Napierala					rsize = IDX_TO_OFF(size);
099e7e95SEdward Tomasz Napierala					PROC_LOCK(p);
937c1b07SAndriy Gapon					if (p->p_state == PRS_NORMAL)
099e7e95SEdward Tomasz Napierala						racct_set(p, RACCT_RSS, rsize);
099e7e95SEdward Tomasz Napierala					PROC_UNLOCK(p);
099e7e95SEdward Tomasz Napierala					if (rsize > ravailable)
099e7e95SEdward Tomasz Napierala						tryagain = 1;
099e7e95SEdward Tomasz Napierala				}
4b5c9cf6SEdward Tomasz Napierala			}
afcc55f3SEdward Tomasz Napierala#endif
6bed074cSKonstantin Belousov			vmspace_free(vm);
95e2409aSKonstantin Belousov			sx_slock(&allproc_lock);
95e2409aSKonstantin Belousov			PRELE(p);
2fe6e4d7SDavid Greenman		}
1005a129SJohn Baldwin		sx_sunlock(&allproc_lock);
099e7e95SEdward Tomasz Napierala		if (tryagain != 0 && attempts <= 10)
099e7e95SEdward Tomasz Napierala			goto again;
24a1cce3SDavid Greenman	}
2fe6e4d7SDavid Greenman}
a1287949SEivind Eklund#endif			/* !defined(NO_SWAPPING) */