sys/vm/vm_pageout.c

60727d8bSWarner Losh/*-
26f9a767SRodney W. Grimes * Copyright (c) 1991 Regents of the University of California.
26f9a767SRodney W. Grimes * All rights reserved.
26f9a767SRodney W. Grimes * Copyright (c) 1994 John S. Dyson
26f9a767SRodney W. Grimes * All rights reserved.
26f9a767SRodney W. Grimes * Copyright (c) 1994 David Greenman
26f9a767SRodney W. Grimes * All rights reserved.
8dbca793STor Egge * Copyright (c) 2005 Yahoo! Technologies Norway AS
8dbca793STor Egge * All rights reserved.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * This code is derived from software contributed to Berkeley by
df8bae1dSRodney W. Grimes * The Mach Operating System project at Carnegie-Mellon University.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Redistribution and use in source and binary forms, with or without
df8bae1dSRodney W. Grimes * modification, are permitted provided that the following conditions
df8bae1dSRodney W. Grimes * are met:
df8bae1dSRodney W. Grimes * 1. Redistributions of source code must retain the above copyright
df8bae1dSRodney W. Grimes *    notice, this list of conditions and the following disclaimer.
df8bae1dSRodney W. Grimes * 2. Redistributions in binary form must reproduce the above copyright
df8bae1dSRodney W. Grimes *    notice, this list of conditions and the following disclaimer in the
df8bae1dSRodney W. Grimes *    documentation and/or other materials provided with the distribution.
df8bae1dSRodney W. Grimes * 3. All advertising materials mentioning features or use of this software
5929bcfaSPhilippe Charnier *    must display the following acknowledgement:
df8bae1dSRodney W. Grimes *	This product includes software developed by the University of
df8bae1dSRodney W. Grimes *	California, Berkeley and its contributors.
df8bae1dSRodney W. Grimes * 4. Neither the name of the University nor the names of its contributors
df8bae1dSRodney W. Grimes *    may be used to endorse or promote products derived from this software
df8bae1dSRodney W. Grimes *    without specific prior written permission.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
df8bae1dSRodney W. Grimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
df8bae1dSRodney W. Grimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
df8bae1dSRodney W. Grimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
df8bae1dSRodney W. Grimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
df8bae1dSRodney W. Grimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
df8bae1dSRodney W. Grimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
df8bae1dSRodney W. Grimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
df8bae1dSRodney W. Grimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
df8bae1dSRodney W. Grimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
df8bae1dSRodney W. Grimes * SUCH DAMAGE.
df8bae1dSRodney W. Grimes *
3c4dd356SDavid Greenman *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Copyright (c) 1987, 1990 Carnegie-Mellon University.
df8bae1dSRodney W. Grimes * All rights reserved.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Authors: Avadis Tevanian, Jr., Michael Wayne Young
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Permission to use, copy, modify and distribute this software and
df8bae1dSRodney W. Grimes * its documentation is hereby granted, provided that both the copyright
df8bae1dSRodney W. Grimes * notice and this permission notice appear in all copies of the
df8bae1dSRodney W. Grimes * software, derivative works or modified versions, and any portions
df8bae1dSRodney W. Grimes * thereof, and that both notices appear in supporting documentation.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
df8bae1dSRodney W. Grimes * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
df8bae1dSRodney W. Grimes * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Carnegie Mellon requests users of this software to return to
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
df8bae1dSRodney W. Grimes *  School of Computer Science
df8bae1dSRodney W. Grimes *  Carnegie Mellon University
df8bae1dSRodney W. Grimes *  Pittsburgh PA 15213-3890
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * any improvements or extensions that they make and grant Carnegie the
df8bae1dSRodney W. Grimes * rights to redistribute these changes.
df8bae1dSRodney W. Grimes */
df8bae1dSRodney W. Grimes
df8bae1dSRodney W. Grimes/*
df8bae1dSRodney W. Grimes *	The proverbial page-out daemon.
df8bae1dSRodney W. Grimes */
df8bae1dSRodney W. Grimes
874651b1SDavid E. O'Brien#include <sys/cdefs.h>
874651b1SDavid E. O'Brien__FBSDID("$FreeBSD$");
874651b1SDavid E. O'Brien
faa5f8d8SAndrzej Bialecki#include "opt_vm.h"
14a0d74eSSteven Hartland#include "opt_kdtrace.h"
df8bae1dSRodney W. Grimes#include <sys/param.h>
26f9a767SRodney W. Grimes#include <sys/systm.h>
b5e8ce9fSBruce Evans#include <sys/kernel.h>
855a310fSJeff Roberson#include <sys/eventhandler.h>
fb919e4dSMark Murray#include <sys/lock.h>
fb919e4dSMark Murray#include <sys/mutex.h>
26f9a767SRodney W. Grimes#include <sys/proc.h>
9c8b8baaSPeter Wemm#include <sys/kthread.h>
0384fff8SJason Evans#include <sys/ktr.h>
97824da3SAlan Cox#include <sys/mount.h>
099e7e95SEdward Tomasz Napierala#include <sys/racct.h>
26f9a767SRodney W. Grimes#include <sys/resourcevar.h>
b43179fbSJeff Roberson#include <sys/sched.h>
14a0d74eSSteven Hartland#include <sys/sdt.h>
d2fc5315SPoul-Henning Kamp#include <sys/signalvar.h>
449c2e92SKonstantin Belousov#include <sys/smp.h>
*a6bf3a9eSRyan Stone#include <sys/time.h>
f6b04d2bSDavid Greenman#include <sys/vnode.h>
efeaf95aSDavid Greenman#include <sys/vmmeter.h>
89f6b863SAttilio Rao#include <sys/rwlock.h>
1005a129SJohn Baldwin#include <sys/sx.h>
38efa82bSJohn Dyson#include <sys/sysctl.h>
df8bae1dSRodney W. Grimes
df8bae1dSRodney W. Grimes#include <vm/vm.h>
efeaf95aSDavid Greenman#include <vm/vm_param.h>
efeaf95aSDavid Greenman#include <vm/vm_object.h>
df8bae1dSRodney W. Grimes#include <vm/vm_page.h>
efeaf95aSDavid Greenman#include <vm/vm_map.h>
df8bae1dSRodney W. Grimes#include <vm/vm_pageout.h>
24a1cce3SDavid Greenman#include <vm/vm_pager.h>
449c2e92SKonstantin Belousov#include <vm/vm_phys.h>
05f0fdd2SPoul-Henning Kamp#include <vm/swap_pager.h>
efeaf95aSDavid Greenman#include <vm/vm_extern.h>
670d17b5SJeff Roberson#include <vm/uma.h>
df8bae1dSRodney W. Grimes
2b14f991SJulian Elischer/*
2b14f991SJulian Elischer * System initialization
2b14f991SJulian Elischer */
2b14f991SJulian Elischer
2b14f991SJulian Elischer/* the kernel process "vm_pageout"*/
11caded3SAlfred Perlsteinstatic void vm_pageout(void);
4d19f4adSSteven Hartlandstatic void vm_pageout_init(void);
34d8b7eaSJeff Robersonstatic int vm_pageout_clean(vm_page_t m);
34d8b7eaSJeff Robersonstatic int vm_pageout_cluster(vm_page_t m);
449c2e92SKonstantin Belousovstatic void vm_pageout_scan(struct vm_domain *vmd, int pass);
449c2e92SKonstantin Belousovstatic void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
45ae1d91SAlan Cox
4d19f4adSSteven HartlandSYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
4d19f4adSSteven Hartland    NULL);
4d19f4adSSteven Hartland
2b14f991SJulian Elischerstruct proc *pageproc;
2b14f991SJulian Elischer
2b14f991SJulian Elischerstatic struct kproc_desc page_kp = {
2b14f991SJulian Elischer	"pagedaemon",
2b14f991SJulian Elischer	vm_pageout,
2b14f991SJulian Elischer	&pageproc
2b14f991SJulian Elischer};
4d19f4adSSteven HartlandSYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
237fdd78SRobert Watson    &page_kp);
2b14f991SJulian Elischer
14a0d74eSSteven HartlandSDT_PROVIDER_DEFINE(vm);
14a0d74eSSteven HartlandSDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
14a0d74eSSteven HartlandSDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
14a0d74eSSteven Hartland
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
2b14f991SJulian Elischer/* the kernel process "vm_daemon"*/
11caded3SAlfred Perlsteinstatic void vm_daemon(void);
f708ef1bSPoul-Henning Kampstatic struct	proc *vmproc;
2b14f991SJulian Elischer
2b14f991SJulian Elischerstatic struct kproc_desc vm_kp = {
2b14f991SJulian Elischer	"vmdaemon",
2b14f991SJulian Elischer	vm_daemon,
2b14f991SJulian Elischer	&vmproc
2b14f991SJulian Elischer};
237fdd78SRobert WatsonSYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
38efa82bSJohn Dyson#endif
2b14f991SJulian Elischer
2b14f991SJulian Elischer
8b245767SAlan Coxint vm_pages_needed;		/* Event on which pageout daemon sleeps */
8b245767SAlan Coxint vm_pageout_deficit;		/* Estimated number of pages deficit */
8b245767SAlan Coxint vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
d9e23210SJeff Robersonint vm_pageout_wakeup_thresh;
26f9a767SRodney W. Grimes
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
f708ef1bSPoul-Henning Kampstatic int vm_pageout_req_swapout;	/* XXX */
f708ef1bSPoul-Henning Kampstatic int vm_daemon_needed;
97824da3SAlan Coxstatic struct mtx vm_daemon_mtx;
97824da3SAlan Cox/* Allow for use by vm_pageout before vm_daemon is initialized. */
97824da3SAlan CoxMTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
38efa82bSJohn Dyson#endif
2b6b0df7SMatthew Dillonstatic int vm_max_launder = 32;
d9e23210SJeff Robersonstatic int vm_pageout_update_period;
4a365329SAndrey Zonovstatic int defer_swap_pageouts;
4a365329SAndrey Zonovstatic int disable_swap_pageouts;
c9612b2dSJeff Robersonstatic int lowmem_period = 10;
*a6bf3a9eSRyan Stonestatic time_t lowmem_uptime;
70111b90SJohn Dyson
38efa82bSJohn Dyson#if defined(NO_SWAPPING)
303b270bSEivind Eklundstatic int vm_swap_enabled = 0;
303b270bSEivind Eklundstatic int vm_swap_idle_enabled = 0;
38efa82bSJohn Dyson#else
303b270bSEivind Eklundstatic int vm_swap_enabled = 1;
303b270bSEivind Eklundstatic int vm_swap_idle_enabled = 0;
38efa82bSJohn Dyson#endif
38efa82bSJohn Dyson
8311a2b8SWill Andrewsstatic int vm_panic_on_oom = 0;
8311a2b8SWill Andrews
8311a2b8SWill AndrewsSYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
8311a2b8SWill Andrews	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
8311a2b8SWill Andrews	"panic on out of memory instead of killing the largest process");
8311a2b8SWill Andrews
d9e23210SJeff RobersonSYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
d9e23210SJeff Roberson	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
d9e23210SJeff Roberson	"free page threshold for waking up the pageout daemon");
d9e23210SJeff Roberson
2b6b0df7SMatthew DillonSYSCTL_INT(_vm, OID_AUTO, max_launder,
2b6b0df7SMatthew Dillon	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
38efa82bSJohn Dyson
d9e23210SJeff RobersonSYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
d9e23210SJeff Roberson	CTLFLAG_RW, &vm_pageout_update_period, 0,
d9e23210SJeff Roberson	"Maximum active LRU update period");
53636869SAndrey Zonov
c9612b2dSJeff RobersonSYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
c9612b2dSJeff Roberson	"Low memory callback period");
c9612b2dSJeff Roberson
38efa82bSJohn Dyson#if defined(NO_SWAPPING)
ceb0cf87SJohn DysonSYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
6bd9cb1cSTom Rhodes	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
6bd9cb1cSTom Rhodes	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
38efa82bSJohn Dyson#else
ceb0cf87SJohn DysonSYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
38efa82bSJohn Dyson#endif
26f9a767SRodney W. Grimes
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
b0359e2cSPeter Wemm	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
12ac6a1dSJohn Dyson
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
b0359e2cSPeter Wemm	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
12ac6a1dSJohn Dyson
23b59018SMatthew Dillonstatic int pageout_lock_miss;
23b59018SMatthew DillonSYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
23b59018SMatthew Dillon	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
23b59018SMatthew Dillon
ffc82b0aSJohn Dyson#define VM_PAGEOUT_PAGE_COUNT 16
bbc0ec52SDavid Greenmanint vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
df8bae1dSRodney W. Grimes
c3cb3e12SDavid Greenmanint vm_page_max_wired;		/* XXX max # of wired pages system-wide */
5dfc2870SAlan CoxSYSCTL_INT(_vm, OID_AUTO, max_wired,
5dfc2870SAlan Cox	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
df8bae1dSRodney W. Grimes
85eeca35SAlan Coxstatic boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
449c2e92SKonstantin Belousovstatic boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
449c2e92SKonstantin Belousov    vm_paddr_t);
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
ecf6279fSAlan Coxstatic void vm_pageout_map_deactivate_pages(vm_map_t, long);
ecf6279fSAlan Coxstatic void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
97824da3SAlan Coxstatic void vm_req_vmdaemon(int req);
38efa82bSJohn Dyson#endif
85eeca35SAlan Coxstatic boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
cd41fc12SDavid Greenman
a8229fa3SAlan Cox/*
a8229fa3SAlan Cox * Initialize a dummy page for marking the caller's place in the specified
a8229fa3SAlan Cox * paging queue.  In principle, this function only needs to set the flag
c7aebda8SAttilio Rao * PG_MARKER.  Nonetheless, it wirte busies and initializes the hold count
c7aebda8SAttilio Rao * to one as safety precautions.
a8229fa3SAlan Cox */
8c616246SKonstantin Belousovstatic void
8c616246SKonstantin Belousovvm_pageout_init_marker(vm_page_t marker, u_short queue)
8c616246SKonstantin Belousov{
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	bzero(marker, sizeof(*marker));
a8229fa3SAlan Cox	marker->flags = PG_MARKER;
c7aebda8SAttilio Rao	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
8c616246SKonstantin Belousov	marker->queue = queue;
a8229fa3SAlan Cox	marker->hold_count = 1;
8c616246SKonstantin Belousov}
8c616246SKonstantin Belousov
26f9a767SRodney W. Grimes/*
8dbca793STor Egge * vm_pageout_fallback_object_lock:
8dbca793STor Egge *
89f6b863SAttilio Rao * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
8dbca793STor Egge * known to have failed and page queue must be either PQ_ACTIVE or
8dbca793STor Egge * PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
8dbca793STor Egge * while locking the vm object.  Use marker page to detect page queue
8dbca793STor Egge * changes and maintain notion of next page on page queue.  Return
8dbca793STor Egge * TRUE if no changes were detected, FALSE otherwise.  vm object is
8dbca793STor Egge * locked on return.
8dbca793STor Egge *
8dbca793STor Egge * This function depends on both the lock portion of struct vm_object
8dbca793STor Egge * and normal struct vm_page being type stable.
8dbca793STor Egge */
85eeca35SAlan Coxstatic boolean_t
8dbca793STor Eggevm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
8dbca793STor Egge{
8dbca793STor Egge	struct vm_page marker;
8d220203SAlan Cox	struct vm_pagequeue *pq;
8dbca793STor Egge	boolean_t unchanged;
8dbca793STor Egge	u_short queue;
8dbca793STor Egge	vm_object_t object;
8dbca793STor Egge
8dbca793STor Egge	queue = m->queue;
8c616246SKonstantin Belousov	vm_pageout_init_marker(&marker, queue);
449c2e92SKonstantin Belousov	pq = vm_page_pagequeue(m);
8dbca793STor Egge	object = m->object;
8dbca793STor Egge
c325e866SKonstantin Belousov	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
2965a453SKip Macy	vm_page_unlock(m);
89f6b863SAttilio Rao	VM_OBJECT_WLOCK(object);
2965a453SKip Macy	vm_page_lock(m);
8d220203SAlan Cox	vm_pagequeue_lock(pq);
8dbca793STor Egge
8dbca793STor Egge	/* Page queue might have changed. */
c325e866SKonstantin Belousov	*next = TAILQ_NEXT(&marker, plinks.q);
8dbca793STor Egge	unchanged = (m->queue == queue &&
8dbca793STor Egge		     m->object == object &&
c325e866SKonstantin Belousov		     &marker == TAILQ_NEXT(m, plinks.q));
c325e866SKonstantin Belousov	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
8dbca793STor Egge	return (unchanged);
8dbca793STor Egge}
8dbca793STor Egge
8dbca793STor Egge/*
8c616246SKonstantin Belousov * Lock the page while holding the page queue lock.  Use marker page
8c616246SKonstantin Belousov * to detect page queue changes and maintain notion of next page on
8c616246SKonstantin Belousov * page queue.  Return TRUE if no changes were detected, FALSE
8c616246SKonstantin Belousov * otherwise.  The page is locked on return. The page queue lock might
8c616246SKonstantin Belousov * be dropped and reacquired.
8c616246SKonstantin Belousov *
8c616246SKonstantin Belousov * This function depends on normal struct vm_page being type stable.
8c616246SKonstantin Belousov */
85eeca35SAlan Coxstatic boolean_t
8c616246SKonstantin Belousovvm_pageout_page_lock(vm_page_t m, vm_page_t *next)
8c616246SKonstantin Belousov{
8c616246SKonstantin Belousov	struct vm_page marker;
8d220203SAlan Cox	struct vm_pagequeue *pq;
8c616246SKonstantin Belousov	boolean_t unchanged;
8c616246SKonstantin Belousov	u_short queue;
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	vm_page_lock_assert(m, MA_NOTOWNED);
8c616246SKonstantin Belousov	if (vm_page_trylock(m))
8c616246SKonstantin Belousov		return (TRUE);
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	queue = m->queue;
8c616246SKonstantin Belousov	vm_pageout_init_marker(&marker, queue);
449c2e92SKonstantin Belousov	pq = vm_page_pagequeue(m);
8c616246SKonstantin Belousov
c325e866SKonstantin Belousov	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
8c616246SKonstantin Belousov	vm_page_lock(m);
8d220203SAlan Cox	vm_pagequeue_lock(pq);
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	/* Page queue might have changed. */
c325e866SKonstantin Belousov	*next = TAILQ_NEXT(&marker, plinks.q);
c325e866SKonstantin Belousov	unchanged = (m->queue == queue && &marker == TAILQ_NEXT(m, plinks.q));
c325e866SKonstantin Belousov	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
8c616246SKonstantin Belousov	return (unchanged);
8c616246SKonstantin Belousov}
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov/*
26f9a767SRodney W. Grimes * vm_pageout_clean:
24a1cce3SDavid Greenman *
0d94caffSDavid Greenman * Clean the page and remove it from the laundry.
26f9a767SRodney W. Grimes *
0d94caffSDavid Greenman * We set the busy bit to cause potential page faults on this page to
1c7c3c6aSMatthew Dillon * block.  Note the careful timing, however, the busy bit isn't set till
1c7c3c6aSMatthew Dillon * late and we cannot do anything that will mess with the page.
26f9a767SRodney W. Grimes */
3af76890SPoul-Henning Kampstatic int
34d8b7eaSJeff Robersonvm_pageout_cluster(vm_page_t m)
24a1cce3SDavid Greenman{
54d92145SMatthew Dillon	vm_object_t object;
91b4f427SAlan Cox	vm_page_t mc[2*vm_pageout_page_count], pb, ps;
3562af12SAlan Cox	int pageout_count;
90ecac61SMatthew Dillon	int ib, is, page_base;
a316d390SJohn Dyson	vm_pindex_t pindex = m->pindex;
26f9a767SRodney W. Grimes
95976f3fSAlan Cox	vm_page_lock_assert(m, MA_OWNED);
17f6a17bSAlan Cox	object = m->object;
89f6b863SAttilio Rao	VM_OBJECT_ASSERT_WLOCKED(object);
0cddd8f0SMatthew Dillon
26f9a767SRodney W. Grimes	/*
1c7c3c6aSMatthew Dillon	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
1c7c3c6aSMatthew Dillon	 * with the new swapper, but we could have serious problems paging
1c7c3c6aSMatthew Dillon	 * out other object types if there is insufficient memory.
1c7c3c6aSMatthew Dillon	 *
1c7c3c6aSMatthew Dillon	 * Unfortunately, checking free memory here is far too late, so the
1c7c3c6aSMatthew Dillon	 * check has been moved up a procedural level.
1c7c3c6aSMatthew Dillon	 */
1c7c3c6aSMatthew Dillon
24a1cce3SDavid Greenman	/*
9e897b1bSAlan Cox	 * Can't clean the page if it's busy or held.
24a1cce3SDavid Greenman	 */
c7aebda8SAttilio Rao	vm_page_assert_unbusied(m);
95976f3fSAlan Cox	KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
17f6a17bSAlan Cox	vm_page_unlock(m);
0d94caffSDavid Greenman
91b4f427SAlan Cox	mc[vm_pageout_page_count] = pb = ps = m;
26f9a767SRodney W. Grimes	pageout_count = 1;
f35329acSJohn Dyson	page_base = vm_pageout_page_count;
90ecac61SMatthew Dillon	ib = 1;
90ecac61SMatthew Dillon	is = 1;
90ecac61SMatthew Dillon
24a1cce3SDavid Greenman	/*
24a1cce3SDavid Greenman	 * Scan object for clusterable pages.
24a1cce3SDavid Greenman	 *
24a1cce3SDavid Greenman	 * We can cluster ONLY if: ->> the page is NOT
24a1cce3SDavid Greenman	 * clean, wired, busy, held, or mapped into a
24a1cce3SDavid Greenman	 * buffer, and one of the following:
24a1cce3SDavid Greenman	 * 1) The page is inactive, or a seldom used
24a1cce3SDavid Greenman	 *    active page.
24a1cce3SDavid Greenman	 * -or-
24a1cce3SDavid Greenman	 * 2) we force the issue.
90ecac61SMatthew Dillon	 *
90ecac61SMatthew Dillon	 * During heavy mmap/modification loads the pageout
90ecac61SMatthew Dillon	 * daemon can really fragment the underlying file
90ecac61SMatthew Dillon	 * due to flushing pages out of order and not trying
90ecac61SMatthew Dillon	 * align the clusters (which leave sporatic out-of-order
90ecac61SMatthew Dillon	 * holes).  To solve this problem we do the reverse scan
90ecac61SMatthew Dillon	 * first and attempt to align our cluster, then do a
90ecac61SMatthew Dillon	 * forward scan if room remains.
24a1cce3SDavid Greenman	 */
90ecac61SMatthew Dillonmore:
90ecac61SMatthew Dillon	while (ib && pageout_count < vm_pageout_page_count) {
24a1cce3SDavid Greenman		vm_page_t p;
f6b04d2bSDavid Greenman
90ecac61SMatthew Dillon		if (ib > pindex) {
90ecac61SMatthew Dillon			ib = 0;
90ecac61SMatthew Dillon			break;
f6b04d2bSDavid Greenman		}
90ecac61SMatthew Dillon
c7aebda8SAttilio Rao		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
90ecac61SMatthew Dillon			ib = 0;
90ecac61SMatthew Dillon			break;
f6b04d2bSDavid Greenman		}
2965a453SKip Macy		vm_page_lock(p);
24a1cce3SDavid Greenman		vm_page_test_dirty(p);
26f4eea5SAlan Cox		if (p->dirty == 0 ||
90ecac61SMatthew Dillon		    p->queue != PQ_INACTIVE ||
57601bcbSMatthew Dillon		    p->hold_count != 0) {	/* may be undergoing I/O */
2965a453SKip Macy			vm_page_unlock(p);
90ecac61SMatthew Dillon			ib = 0;
24a1cce3SDavid Greenman			break;
f6b04d2bSDavid Greenman		}
2965a453SKip Macy		vm_page_unlock(p);
91b4f427SAlan Cox		mc[--page_base] = pb = p;
90ecac61SMatthew Dillon		++pageout_count;
90ecac61SMatthew Dillon		++ib;
24a1cce3SDavid Greenman		/*
90ecac61SMatthew Dillon		 * alignment boundry, stop here and switch directions.  Do
90ecac61SMatthew Dillon		 * not clear ib.
24a1cce3SDavid Greenman		 */
90ecac61SMatthew Dillon		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
90ecac61SMatthew Dillon			break;
24a1cce3SDavid Greenman	}
90ecac61SMatthew Dillon
90ecac61SMatthew Dillon	while (pageout_count < vm_pageout_page_count &&
90ecac61SMatthew Dillon	    pindex + is < object->size) {
90ecac61SMatthew Dillon		vm_page_t p;
90ecac61SMatthew Dillon
c7aebda8SAttilio Rao		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
90ecac61SMatthew Dillon			break;
2965a453SKip Macy		vm_page_lock(p);
24a1cce3SDavid Greenman		vm_page_test_dirty(p);
26f4eea5SAlan Cox		if (p->dirty == 0 ||
90ecac61SMatthew Dillon		    p->queue != PQ_INACTIVE ||
57601bcbSMatthew Dillon		    p->hold_count != 0) {	/* may be undergoing I/O */
2965a453SKip Macy			vm_page_unlock(p);
24a1cce3SDavid Greenman			break;
24a1cce3SDavid Greenman		}
2965a453SKip Macy		vm_page_unlock(p);
91b4f427SAlan Cox		mc[page_base + pageout_count] = ps = p;
90ecac61SMatthew Dillon		++pageout_count;
90ecac61SMatthew Dillon		++is;
24a1cce3SDavid Greenman	}
90ecac61SMatthew Dillon
90ecac61SMatthew Dillon	/*
90ecac61SMatthew Dillon	 * If we exhausted our forward scan, continue with the reverse scan
90ecac61SMatthew Dillon	 * when possible, even past a page boundry.  This catches boundry
90ecac61SMatthew Dillon	 * conditions.
90ecac61SMatthew Dillon	 */
90ecac61SMatthew Dillon	if (ib && pageout_count < vm_pageout_page_count)
90ecac61SMatthew Dillon		goto more;
f6b04d2bSDavid Greenman
67bf6868SJohn Dyson	/*
67bf6868SJohn Dyson	 * we allow reads during pageouts...
67bf6868SJohn Dyson	 */
126d6082SKonstantin Belousov	return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
126d6082SKonstantin Belousov	    NULL));
aef922f5SJohn Dyson}
aef922f5SJohn Dyson
1c7c3c6aSMatthew Dillon/*
1c7c3c6aSMatthew Dillon * vm_pageout_flush() - launder the given pages
1c7c3c6aSMatthew Dillon *
1c7c3c6aSMatthew Dillon *	The given pages are laundered.  Note that we setup for the start of
1c7c3c6aSMatthew Dillon *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
1c7c3c6aSMatthew Dillon *	reference count all in here rather then in the parent.  If we want
1c7c3c6aSMatthew Dillon *	the parent to do more sophisticated things we may have to change
1c7c3c6aSMatthew Dillon *	the ordering.
1e8a675cSKonstantin Belousov *
1e8a675cSKonstantin Belousov *	Returned runlen is the count of pages between mreq and first
1e8a675cSKonstantin Belousov *	page after mreq with status VM_PAGER_AGAIN.
126d6082SKonstantin Belousov *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
126d6082SKonstantin Belousov *	for any page in runlen set.
1c7c3c6aSMatthew Dillon */
aef922f5SJohn Dysonint
126d6082SKonstantin Belousovvm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
126d6082SKonstantin Belousov    boolean_t *eio)
aef922f5SJohn Dyson{
2e3b314dSAlan Cox	vm_object_t object = mc[0]->object;
aef922f5SJohn Dyson	int pageout_status[count];
95461b45SJohn Dyson	int numpagedout = 0;
1e8a675cSKonstantin Belousov	int i, runlen;
aef922f5SJohn Dyson
89f6b863SAttilio Rao	VM_OBJECT_ASSERT_WLOCKED(object);
7bec141bSKip Macy
1c7c3c6aSMatthew Dillon	/*
1c7c3c6aSMatthew Dillon	 * Initiate I/O.  Bump the vm_page_t->busy counter and
1c7c3c6aSMatthew Dillon	 * mark the pages read-only.
1c7c3c6aSMatthew Dillon	 *
1c7c3c6aSMatthew Dillon	 * We do not have to fixup the clean/dirty bits here... we can
1c7c3c6aSMatthew Dillon	 * allow the pager to do it after the I/O completes.
02fa91d3SMatthew Dillon	 *
02fa91d3SMatthew Dillon	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
02fa91d3SMatthew Dillon	 * edge case with file fragments.
1c7c3c6aSMatthew Dillon	 */
8f9110f6SJohn Dyson	for (i = 0; i < count; i++) {
7a935082SAlan Cox		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
7a935082SAlan Cox		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
7a935082SAlan Cox			mc[i], i, count));
c7aebda8SAttilio Rao		vm_page_sbusy(mc[i]);
78985e42SAlan Cox		pmap_remove_write(mc[i]);
2965a453SKip Macy	}
d474eaaaSDoug Rabson	vm_object_pip_add(object, count);
aef922f5SJohn Dyson
d076fbeaSAlan Cox	vm_pager_put_pages(object, mc, count, flags, pageout_status);
26f9a767SRodney W. Grimes
1e8a675cSKonstantin Belousov	runlen = count - mreq;
126d6082SKonstantin Belousov	if (eio != NULL)
126d6082SKonstantin Belousov		*eio = FALSE;
aef922f5SJohn Dyson	for (i = 0; i < count; i++) {
aef922f5SJohn Dyson		vm_page_t mt = mc[i];
24a1cce3SDavid Greenman
4cd45723SAlan Cox		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
6031c68dSAlan Cox		    !pmap_page_is_write_mapped(mt),
9ea8d1a6SAlan Cox		    ("vm_pageout_flush: page %p is not write protected", mt));
26f9a767SRodney W. Grimes		switch (pageout_status[i]) {
26f9a767SRodney W. Grimes		case VM_PAGER_OK:
26f9a767SRodney W. Grimes		case VM_PAGER_PEND:
95461b45SJohn Dyson			numpagedout++;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_BAD:
26f9a767SRodney W. Grimes			/*
0d94caffSDavid Greenman			 * Page outside of range of object. Right now we
0d94caffSDavid Greenman			 * essentially lose the changes by pretending it
0d94caffSDavid Greenman			 * worked.
26f9a767SRodney W. Grimes			 */
90ecac61SMatthew Dillon			vm_page_undirty(mt);
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_ERROR:
26f9a767SRodney W. Grimes		case VM_PAGER_FAIL:
26f9a767SRodney W. Grimes			/*
0d94caffSDavid Greenman			 * If page couldn't be paged out, then reactivate the
0d94caffSDavid Greenman			 * page so it doesn't clog the inactive list.  (We
0d94caffSDavid Greenman			 * will try paging out it again later).
26f9a767SRodney W. Grimes			 */
3c4a2440SAlan Cox			vm_page_lock(mt);
24a1cce3SDavid Greenman			vm_page_activate(mt);
3c4a2440SAlan Cox			vm_page_unlock(mt);
126d6082SKonstantin Belousov			if (eio != NULL && i >= mreq && i - mreq < runlen)
126d6082SKonstantin Belousov				*eio = TRUE;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_AGAIN:
1e8a675cSKonstantin Belousov			if (i >= mreq && i - mreq < runlen)
1e8a675cSKonstantin Belousov				runlen = i - mreq;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		}
26f9a767SRodney W. Grimes
26f9a767SRodney W. Grimes		/*
0d94caffSDavid Greenman		 * If the operation is still going, leave the page busy to
0d94caffSDavid Greenman		 * block all other accesses. Also, leave the paging in
0d94caffSDavid Greenman		 * progress indicator set so that we don't attempt an object
0d94caffSDavid Greenman		 * collapse.
26f9a767SRodney W. Grimes		 */
26f9a767SRodney W. Grimes		if (pageout_status[i] != VM_PAGER_PEND) {
f919ebdeSDavid Greenman			vm_object_pip_wakeup(object);
c7aebda8SAttilio Rao			vm_page_sunbusy(mt);
3c4a2440SAlan Cox		}
3c4a2440SAlan Cox	}
1e8a675cSKonstantin Belousov	if (prunlen != NULL)
1e8a675cSKonstantin Belousov		*prunlen = runlen;
3c4a2440SAlan Cox	return (numpagedout);
26f9a767SRodney W. Grimes}
26f9a767SRodney W. Grimes
85eeca35SAlan Coxstatic boolean_t
449c2e92SKonstantin Belousovvm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
449c2e92SKonstantin Belousov    vm_paddr_t high)
85eeca35SAlan Cox{
85eeca35SAlan Cox	struct mount *mp;
85eeca35SAlan Cox	struct vnode *vp;
85eeca35SAlan Cox	vm_object_t object;
85eeca35SAlan Cox	vm_paddr_t pa;
85eeca35SAlan Cox	vm_page_t m, m_tmp, next;
1bd7d0b7SKonstantin Belousov	int lockmode;
85eeca35SAlan Cox
8d220203SAlan Cox	vm_pagequeue_lock(pq);
c325e866SKonstantin Belousov	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
85eeca35SAlan Cox		if ((m->flags & PG_MARKER) != 0)
85eeca35SAlan Cox			continue;
85eeca35SAlan Cox		pa = VM_PAGE_TO_PHYS(m);
85eeca35SAlan Cox		if (pa < low || pa + PAGE_SIZE > high)
85eeca35SAlan Cox			continue;
85eeca35SAlan Cox		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
85eeca35SAlan Cox			vm_page_unlock(m);
85eeca35SAlan Cox			continue;
85eeca35SAlan Cox		}
85eeca35SAlan Cox		object = m->object;
89f6b863SAttilio Rao		if ((!VM_OBJECT_TRYWLOCK(object) &&
85eeca35SAlan Cox		    (!vm_pageout_fallback_object_lock(m, &next) ||
c7aebda8SAttilio Rao		    m->hold_count != 0)) || vm_page_busied(m)) {
85eeca35SAlan Cox			vm_page_unlock(m);
89f6b863SAttilio Rao			VM_OBJECT_WUNLOCK(object);
85eeca35SAlan Cox			continue;
85eeca35SAlan Cox		}
85eeca35SAlan Cox		vm_page_test_dirty(m);
9fc4739dSAlan Cox		if (m->dirty == 0 && object->ref_count != 0)
85eeca35SAlan Cox			pmap_remove_all(m);
85eeca35SAlan Cox		if (m->dirty != 0) {
85eeca35SAlan Cox			vm_page_unlock(m);
85eeca35SAlan Cox			if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
89f6b863SAttilio Rao				VM_OBJECT_WUNLOCK(object);
85eeca35SAlan Cox				continue;
85eeca35SAlan Cox			}
85eeca35SAlan Cox			if (object->type == OBJT_VNODE) {
8d220203SAlan Cox				vm_pagequeue_unlock(pq);
85eeca35SAlan Cox				vp = object->handle;
85eeca35SAlan Cox				vm_object_reference_locked(object);
89f6b863SAttilio Rao				VM_OBJECT_WUNLOCK(object);
85eeca35SAlan Cox				(void)vn_start_write(vp, &mp, V_WAIT);
1bd7d0b7SKonstantin Belousov				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
1bd7d0b7SKonstantin Belousov				    LK_SHARED : LK_EXCLUSIVE;
1bd7d0b7SKonstantin Belousov				vn_lock(vp, lockmode | LK_RETRY);
89f6b863SAttilio Rao				VM_OBJECT_WLOCK(object);
85eeca35SAlan Cox				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
89f6b863SAttilio Rao				VM_OBJECT_WUNLOCK(object);
85eeca35SAlan Cox				VOP_UNLOCK(vp, 0);
85eeca35SAlan Cox				vm_object_deallocate(object);
85eeca35SAlan Cox				vn_finished_write(mp);
85eeca35SAlan Cox				return (TRUE);
85eeca35SAlan Cox			} else if (object->type == OBJT_SWAP ||
85eeca35SAlan Cox			    object->type == OBJT_DEFAULT) {
8d220203SAlan Cox				vm_pagequeue_unlock(pq);
85eeca35SAlan Cox				m_tmp = m;
85eeca35SAlan Cox				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
85eeca35SAlan Cox				    0, NULL, NULL);
89f6b863SAttilio Rao				VM_OBJECT_WUNLOCK(object);
85eeca35SAlan Cox				return (TRUE);
85eeca35SAlan Cox			}
85eeca35SAlan Cox		} else {
8d220203SAlan Cox			/*
8d220203SAlan Cox			 * Dequeue here to prevent lock recursion in
8d220203SAlan Cox			 * vm_page_cache().
8d220203SAlan Cox			 */
8d220203SAlan Cox			vm_page_dequeue_locked(m);
85eeca35SAlan Cox			vm_page_cache(m);
85eeca35SAlan Cox			vm_page_unlock(m);
85eeca35SAlan Cox		}
89f6b863SAttilio Rao		VM_OBJECT_WUNLOCK(object);
85eeca35SAlan Cox	}
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
85eeca35SAlan Cox	return (FALSE);
85eeca35SAlan Cox}
85eeca35SAlan Cox
85eeca35SAlan Cox/*
85eeca35SAlan Cox * Increase the number of cached pages.  The specified value, "tries",
85eeca35SAlan Cox * determines which categories of pages are cached:
85eeca35SAlan Cox *
85eeca35SAlan Cox *  0: All clean, inactive pages within the specified physical address range
85eeca35SAlan Cox *     are cached.  Will not sleep.
85eeca35SAlan Cox *  1: The vm_lowmem handlers are called.  All inactive pages within
85eeca35SAlan Cox *     the specified physical address range are cached.  May sleep.
85eeca35SAlan Cox *  2: The vm_lowmem handlers are called.  All inactive and active pages
85eeca35SAlan Cox *     within the specified physical address range are cached.  May sleep.
85eeca35SAlan Cox */
85eeca35SAlan Coxvoid
85eeca35SAlan Coxvm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
85eeca35SAlan Cox{
449c2e92SKonstantin Belousov	int actl, actmax, inactl, inactmax, dom, initial_dom;
449c2e92SKonstantin Belousov	static int start_dom = 0;
85eeca35SAlan Cox
85eeca35SAlan Cox	if (tries > 0) {
85eeca35SAlan Cox		/*
85eeca35SAlan Cox		 * Decrease registered cache sizes.  The vm_lowmem handlers
85eeca35SAlan Cox		 * may acquire locks and/or sleep, so they can only be invoked
85eeca35SAlan Cox		 * when "tries" is greater than zero.
85eeca35SAlan Cox		 */
14a0d74eSSteven Hartland		SDT_PROBE0(vm, , , vm__lowmem_cache);
85eeca35SAlan Cox		EVENTHANDLER_INVOKE(vm_lowmem, 0);
85eeca35SAlan Cox
85eeca35SAlan Cox		/*
85eeca35SAlan Cox		 * We do this explicitly after the caches have been drained
85eeca35SAlan Cox		 * above.
85eeca35SAlan Cox		 */
85eeca35SAlan Cox		uma_reclaim();
85eeca35SAlan Cox	}
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
449c2e92SKonstantin Belousov	 * Make the next scan start on the next domain.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
449c2e92SKonstantin Belousov
85eeca35SAlan Cox	inactl = 0;
44f1c916SBryan Drewery	inactmax = vm_cnt.v_inactive_count;
85eeca35SAlan Cox	actl = 0;
44f1c916SBryan Drewery	actmax = tries < 2 ? 0 : vm_cnt.v_active_count;
449c2e92SKonstantin Belousov	dom = initial_dom;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
449c2e92SKonstantin Belousov	 * Scan domains in round-robin order, first inactive queues,
449c2e92SKonstantin Belousov	 * then active.  Since domain usually owns large physically
449c2e92SKonstantin Belousov	 * contiguous chunk of memory, it makes sense to completely
449c2e92SKonstantin Belousov	 * exhaust one domain before switching to next, while growing
449c2e92SKonstantin Belousov	 * the pool of contiguous physical pages.
449c2e92SKonstantin Belousov	 *
449c2e92SKonstantin Belousov	 * Do not even start launder a domain which cannot contain
449c2e92SKonstantin Belousov	 * the specified address range, as indicated by segments
449c2e92SKonstantin Belousov	 * constituting the domain.
449c2e92SKonstantin Belousov	 */
85eeca35SAlan Coxagain:
449c2e92SKonstantin Belousov	if (inactl < inactmax) {
449c2e92SKonstantin Belousov		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
449c2e92SKonstantin Belousov		    low, high) &&
449c2e92SKonstantin Belousov		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
449c2e92SKonstantin Belousov		    tries, low, high)) {
85eeca35SAlan Cox			inactl++;
85eeca35SAlan Cox			goto again;
85eeca35SAlan Cox		}
449c2e92SKonstantin Belousov		if (++dom == vm_ndomains)
449c2e92SKonstantin Belousov			dom = 0;
449c2e92SKonstantin Belousov		if (dom != initial_dom)
449c2e92SKonstantin Belousov			goto again;
449c2e92SKonstantin Belousov	}
449c2e92SKonstantin Belousov	if (actl < actmax) {
449c2e92SKonstantin Belousov		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
449c2e92SKonstantin Belousov		    low, high) &&
449c2e92SKonstantin Belousov		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
449c2e92SKonstantin Belousov		      tries, low, high)) {
85eeca35SAlan Cox			actl++;
85eeca35SAlan Cox			goto again;
85eeca35SAlan Cox		}
449c2e92SKonstantin Belousov		if (++dom == vm_ndomains)
449c2e92SKonstantin Belousov			dom = 0;
449c2e92SKonstantin Belousov		if (dom != initial_dom)
449c2e92SKonstantin Belousov			goto again;
449c2e92SKonstantin Belousov	}
85eeca35SAlan Cox}
85eeca35SAlan Cox
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
26f9a767SRodney W. Grimes/*
26f9a767SRodney W. Grimes *	vm_pageout_object_deactivate_pages
26f9a767SRodney W. Grimes *
ce186587SAlan Cox *	Deactivate enough pages to satisfy the inactive target
ce186587SAlan Cox *	requirements.
26f9a767SRodney W. Grimes *
26f9a767SRodney W. Grimes *	The object and map must be locked.
26f9a767SRodney W. Grimes */
38efa82bSJohn Dysonstatic void
ce186587SAlan Coxvm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
ce186587SAlan Cox    long desired)
26f9a767SRodney W. Grimes{
ecf6279fSAlan Cox	vm_object_t backing_object, object;
ce186587SAlan Cox	vm_page_t p;
bb7858eaSJeff Roberson	int act_delta, remove_mode;
26f9a767SRodney W. Grimes
e23b0a19SAlan Cox	VM_OBJECT_ASSERT_LOCKED(first_object);
28634820SAlan Cox	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
38efa82bSJohn Dyson		return;
ecf6279fSAlan Cox	for (object = first_object;; object = backing_object) {
ecf6279fSAlan Cox		if (pmap_resident_count(pmap) <= desired)
ecf6279fSAlan Cox			goto unlock_return;
e23b0a19SAlan Cox		VM_OBJECT_ASSERT_LOCKED(object);
28634820SAlan Cox		if ((object->flags & OBJ_UNMANAGED) != 0 ||
28634820SAlan Cox		    object->paging_in_progress != 0)
ecf6279fSAlan Cox			goto unlock_return;
26f9a767SRodney W. Grimes
85b1dc89SAlan Cox		remove_mode = 0;
38efa82bSJohn Dyson		if (object->shadow_count > 1)
38efa82bSJohn Dyson			remove_mode = 1;
26f9a767SRodney W. Grimes		/*
ce186587SAlan Cox		 * Scan the object's entire memory queue.
26f9a767SRodney W. Grimes		 */
ce186587SAlan Cox		TAILQ_FOREACH(p, &object->memq, listq) {
447fe2a4SAlan Cox			if (pmap_resident_count(pmap) <= desired)
447fe2a4SAlan Cox				goto unlock_return;
c7aebda8SAttilio Rao			if (vm_page_busied(p))
447fe2a4SAlan Cox				continue;
ce186587SAlan Cox			PCPU_INC(cnt.v_pdpages);
2965a453SKip Macy			vm_page_lock(p);
ce186587SAlan Cox			if (p->wire_count != 0 || p->hold_count != 0 ||
ecf6279fSAlan Cox			    !pmap_page_exists_quick(pmap, p)) {
2965a453SKip Macy				vm_page_unlock(p);
0d94caffSDavid Greenman				continue;
0d94caffSDavid Greenman			}
bb7858eaSJeff Roberson			act_delta = pmap_ts_referenced(p);
3407fefeSKonstantin Belousov			if ((p->aflags & PGA_REFERENCED) != 0) {
bb7858eaSJeff Roberson				if (act_delta == 0)
bb7858eaSJeff Roberson					act_delta = 1;
3407fefeSKonstantin Belousov				vm_page_aflag_clear(p, PGA_REFERENCED);
ef743ce6SJohn Dyson			}
bb7858eaSJeff Roberson			if (p->queue != PQ_ACTIVE && act_delta != 0) {
ef743ce6SJohn Dyson				vm_page_activate(p);
bb7858eaSJeff Roberson				p->act_count += act_delta;
c8c4b40cSJohn Dyson			} else if (p->queue == PQ_ACTIVE) {
bb7858eaSJeff Roberson				if (act_delta == 0) {
ce186587SAlan Cox					p->act_count -= min(p->act_count,
ce186587SAlan Cox					    ACT_DECLINE);
90776bd7SJeff Roberson					if (!remove_mode && p->act_count == 0) {
4fec79beSAlan Cox						pmap_remove_all(p);
26f9a767SRodney W. Grimes						vm_page_deactivate(p);
8d220203SAlan Cox					} else
8d220203SAlan Cox						vm_page_requeue(p);
c8c4b40cSJohn Dyson				} else {
eaf13dd7SJohn Dyson					vm_page_activate(p);
ce186587SAlan Cox					if (p->act_count < ACT_MAX -
ce186587SAlan Cox					    ACT_ADVANCE)
38efa82bSJohn Dyson						p->act_count += ACT_ADVANCE;
8d220203SAlan Cox					vm_page_requeue(p);
ce186587SAlan Cox				}
ce186587SAlan Cox			} else if (p->queue == PQ_INACTIVE)
ce186587SAlan Cox				pmap_remove_all(p);
2965a453SKip Macy			vm_page_unlock(p);
26f9a767SRodney W. Grimes		}
ecf6279fSAlan Cox		if ((backing_object = object->backing_object) == NULL)
ecf6279fSAlan Cox			goto unlock_return;
e23b0a19SAlan Cox		VM_OBJECT_RLOCK(backing_object);
ecf6279fSAlan Cox		if (object != first_object)
e23b0a19SAlan Cox			VM_OBJECT_RUNLOCK(object);
38efa82bSJohn Dyson	}
ecf6279fSAlan Coxunlock_return:
ecf6279fSAlan Cox	if (object != first_object)
e23b0a19SAlan Cox		VM_OBJECT_RUNLOCK(object);
26f9a767SRodney W. Grimes}
26f9a767SRodney W. Grimes
26f9a767SRodney W. Grimes/*
26f9a767SRodney W. Grimes * deactivate some number of pages in a map, try to do it fairly, but
26f9a767SRodney W. Grimes * that is really hard to do.
26f9a767SRodney W. Grimes */
cd41fc12SDavid Greenmanstatic void
38efa82bSJohn Dysonvm_pageout_map_deactivate_pages(map, desired)
26f9a767SRodney W. Grimes	vm_map_t map;
ecf6279fSAlan Cox	long desired;
26f9a767SRodney W. Grimes{
26f9a767SRodney W. Grimes	vm_map_entry_t tmpe;
38efa82bSJohn Dyson	vm_object_t obj, bigobj;
30105b9eSTor Egge	int nothingwired;
0d94caffSDavid Greenman
d974f03cSAlan Cox	if (!vm_map_trylock(map))
26f9a767SRodney W. Grimes		return;
38efa82bSJohn Dyson
38efa82bSJohn Dyson	bigobj = NULL;
30105b9eSTor Egge	nothingwired = TRUE;
38efa82bSJohn Dyson
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * first, search out the biggest object, and try to free pages from
38efa82bSJohn Dyson	 * that.
38efa82bSJohn Dyson	 */
26f9a767SRodney W. Grimes	tmpe = map->header.next;
38efa82bSJohn Dyson	while (tmpe != &map->header) {
9fdfe602SMatthew Dillon		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
38efa82bSJohn Dyson			obj = tmpe->object.vm_object;
e23b0a19SAlan Cox			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
0774dfb3SAlan Cox				if (obj->shadow_count <= 1 &&
0774dfb3SAlan Cox				    (bigobj == NULL ||
0774dfb3SAlan Cox				     bigobj->resident_page_count < obj->resident_page_count)) {
0774dfb3SAlan Cox					if (bigobj != NULL)
e23b0a19SAlan Cox						VM_OBJECT_RUNLOCK(bigobj);
38efa82bSJohn Dyson					bigobj = obj;
0774dfb3SAlan Cox				} else
e23b0a19SAlan Cox					VM_OBJECT_RUNLOCK(obj);
38efa82bSJohn Dyson			}
38efa82bSJohn Dyson		}
30105b9eSTor Egge		if (tmpe->wired_count > 0)
30105b9eSTor Egge			nothingwired = FALSE;
38efa82bSJohn Dyson		tmpe = tmpe->next;
38efa82bSJohn Dyson	}
38efa82bSJohn Dyson
0774dfb3SAlan Cox	if (bigobj != NULL) {
ecf6279fSAlan Cox		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
e23b0a19SAlan Cox		VM_OBJECT_RUNLOCK(bigobj);
0774dfb3SAlan Cox	}
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * Next, hunt around for other pages to deactivate.  We actually
38efa82bSJohn Dyson	 * do this search sort of wrong -- .text first is not the best idea.
38efa82bSJohn Dyson	 */
38efa82bSJohn Dyson	tmpe = map->header.next;
38efa82bSJohn Dyson	while (tmpe != &map->header) {
b1028ad1SLuoqi Chen		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
38efa82bSJohn Dyson			break;
9fdfe602SMatthew Dillon		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
38efa82bSJohn Dyson			obj = tmpe->object.vm_object;
0774dfb3SAlan Cox			if (obj != NULL) {
e23b0a19SAlan Cox				VM_OBJECT_RLOCK(obj);
ecf6279fSAlan Cox				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
e23b0a19SAlan Cox				VM_OBJECT_RUNLOCK(obj);
0774dfb3SAlan Cox			}
38efa82bSJohn Dyson		}
26f9a767SRodney W. Grimes		tmpe = tmpe->next;
38857e7fSAlan Cox	}
38efa82bSJohn Dyson
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * Remove all mappings if a process is swapped out, this will free page
38efa82bSJohn Dyson	 * table pages.
38efa82bSJohn Dyson	 */
38857e7fSAlan Cox	if (desired == 0 && nothingwired) {
8d01a3b2SNathan Whitehorn		pmap_remove(vm_map_pmap(map), vm_map_min(map),
8d01a3b2SNathan Whitehorn		    vm_map_max(map));
38857e7fSAlan Cox	}
938b0f5bSMarcel Moolenaar
38efa82bSJohn Dyson	vm_map_unlock(map);
26f9a767SRodney W. Grimes}
a1287949SEivind Eklund#endif		/* !defined(NO_SWAPPING) */
df8bae1dSRodney W. Grimes
1c7c3c6aSMatthew Dillon/*
34d8b7eaSJeff Roberson * Attempt to acquire all of the necessary locks to launder a page and
34d8b7eaSJeff Roberson * then call through the clustering layer to PUTPAGES.  Wait a short
34d8b7eaSJeff Roberson * time for a vnode lock.
34d8b7eaSJeff Roberson *
34d8b7eaSJeff Roberson * Requires the page and object lock on entry, releases both before return.
34d8b7eaSJeff Roberson * Returns 0 on success and an errno otherwise.
34d8b7eaSJeff Roberson */
34d8b7eaSJeff Robersonstatic int
34d8b7eaSJeff Robersonvm_pageout_clean(vm_page_t m)
34d8b7eaSJeff Roberson{
34d8b7eaSJeff Roberson	struct vnode *vp;
34d8b7eaSJeff Roberson	struct mount *mp;
34d8b7eaSJeff Roberson	vm_object_t object;
34d8b7eaSJeff Roberson	vm_pindex_t pindex;
34d8b7eaSJeff Roberson	int error, lockmode;
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson	vm_page_assert_locked(m);
34d8b7eaSJeff Roberson	object = m->object;
34d8b7eaSJeff Roberson	VM_OBJECT_ASSERT_WLOCKED(object);
34d8b7eaSJeff Roberson	error = 0;
34d8b7eaSJeff Roberson	vp = NULL;
34d8b7eaSJeff Roberson	mp = NULL;
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson	/*
34d8b7eaSJeff Roberson	 * The object is already known NOT to be dead.   It
34d8b7eaSJeff Roberson	 * is possible for the vget() to block the whole
34d8b7eaSJeff Roberson	 * pageout daemon, but the new low-memory handling
34d8b7eaSJeff Roberson	 * code should prevent it.
34d8b7eaSJeff Roberson	 *
34d8b7eaSJeff Roberson	 * We can't wait forever for the vnode lock, we might
34d8b7eaSJeff Roberson	 * deadlock due to a vn_read() getting stuck in
34d8b7eaSJeff Roberson	 * vm_wait while holding this vnode.  We skip the
34d8b7eaSJeff Roberson	 * vnode if we can't get it in a reasonable amount
34d8b7eaSJeff Roberson	 * of time.
34d8b7eaSJeff Roberson	 */
34d8b7eaSJeff Roberson	if (object->type == OBJT_VNODE) {
34d8b7eaSJeff Roberson		vm_page_unlock(m);
34d8b7eaSJeff Roberson		vp = object->handle;
34d8b7eaSJeff Roberson		if (vp->v_type == VREG &&
34d8b7eaSJeff Roberson		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
34d8b7eaSJeff Roberson			mp = NULL;
34d8b7eaSJeff Roberson			error = EDEADLK;
34d8b7eaSJeff Roberson			goto unlock_all;
34d8b7eaSJeff Roberson		}
34d8b7eaSJeff Roberson		KASSERT(mp != NULL,
34d8b7eaSJeff Roberson		    ("vp %p with NULL v_mount", vp));
34d8b7eaSJeff Roberson		vm_object_reference_locked(object);
34d8b7eaSJeff Roberson		pindex = m->pindex;
34d8b7eaSJeff Roberson		VM_OBJECT_WUNLOCK(object);
34d8b7eaSJeff Roberson		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
34d8b7eaSJeff Roberson		    LK_SHARED : LK_EXCLUSIVE;
34d8b7eaSJeff Roberson		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
34d8b7eaSJeff Roberson			vp = NULL;
34d8b7eaSJeff Roberson			error = EDEADLK;
34d8b7eaSJeff Roberson			goto unlock_mp;
34d8b7eaSJeff Roberson		}
34d8b7eaSJeff Roberson		VM_OBJECT_WLOCK(object);
34d8b7eaSJeff Roberson		vm_page_lock(m);
34d8b7eaSJeff Roberson		/*
34d8b7eaSJeff Roberson		 * While the object and page were unlocked, the page
34d8b7eaSJeff Roberson		 * may have been:
34d8b7eaSJeff Roberson		 * (1) moved to a different queue,
34d8b7eaSJeff Roberson		 * (2) reallocated to a different object,
34d8b7eaSJeff Roberson		 * (3) reallocated to a different offset, or
34d8b7eaSJeff Roberson		 * (4) cleaned.
34d8b7eaSJeff Roberson		 */
34d8b7eaSJeff Roberson		if (m->queue != PQ_INACTIVE || m->object != object ||
34d8b7eaSJeff Roberson		    m->pindex != pindex || m->dirty == 0) {
34d8b7eaSJeff Roberson			vm_page_unlock(m);
34d8b7eaSJeff Roberson			error = ENXIO;
34d8b7eaSJeff Roberson			goto unlock_all;
34d8b7eaSJeff Roberson		}
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson		/*
34d8b7eaSJeff Roberson		 * The page may have been busied or held while the object
34d8b7eaSJeff Roberson		 * and page locks were released.
34d8b7eaSJeff Roberson		 */
34d8b7eaSJeff Roberson		if (vm_page_busied(m) || m->hold_count != 0) {
34d8b7eaSJeff Roberson			vm_page_unlock(m);
34d8b7eaSJeff Roberson			error = EBUSY;
34d8b7eaSJeff Roberson			goto unlock_all;
34d8b7eaSJeff Roberson		}
34d8b7eaSJeff Roberson	}
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson	/*
34d8b7eaSJeff Roberson	 * If a page is dirty, then it is either being washed
34d8b7eaSJeff Roberson	 * (but not yet cleaned) or it is still in the
34d8b7eaSJeff Roberson	 * laundry.  If it is still in the laundry, then we
34d8b7eaSJeff Roberson	 * start the cleaning operation.
34d8b7eaSJeff Roberson	 */
34d8b7eaSJeff Roberson	if (vm_pageout_cluster(m) == 0)
34d8b7eaSJeff Roberson		error = EIO;
34d8b7eaSJeff Roberson
34d8b7eaSJeff Robersonunlock_all:
34d8b7eaSJeff Roberson	VM_OBJECT_WUNLOCK(object);
34d8b7eaSJeff Roberson
34d8b7eaSJeff Robersonunlock_mp:
34d8b7eaSJeff Roberson	vm_page_lock_assert(m, MA_NOTOWNED);
34d8b7eaSJeff Roberson	if (mp != NULL) {
34d8b7eaSJeff Roberson		if (vp != NULL)
34d8b7eaSJeff Roberson			vput(vp);
34d8b7eaSJeff Roberson		vm_object_deallocate(object);
34d8b7eaSJeff Roberson		vn_finished_write(mp);
34d8b7eaSJeff Roberson	}
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson	return (error);
34d8b7eaSJeff Roberson}
34d8b7eaSJeff Roberson
34d8b7eaSJeff Roberson/*
df8bae1dSRodney W. Grimes *	vm_pageout_scan does the dirty work for the pageout daemon.
d9e23210SJeff Roberson *
d9e23210SJeff Roberson *	pass 0 - Update active LRU/deactivate pages
d9e23210SJeff Roberson *	pass 1 - Move inactive to cache or free
d9e23210SJeff Roberson *	pass 2 - Launder dirty pages
df8bae1dSRodney W. Grimes */
2b6b0df7SMatthew Dillonstatic void
449c2e92SKonstantin Belousovvm_pageout_scan(struct vm_domain *vmd, int pass)
df8bae1dSRodney W. Grimes{
502ba6e4SJohn Dyson	vm_page_t m, next;
8d220203SAlan Cox	struct vm_pagequeue *pq;
df8bae1dSRodney W. Grimes	vm_object_t object;
22cf98d1SAlan Cox	long min_scan;
9099545aSAlan Cox	int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
f6b04d2bSDavid Greenman	int vnodes_skipped = 0;
22cf98d1SAlan Cox	int maxlaunder, scan_tick, scanned;
48cc2fc7SKonstantin Belousov	boolean_t queues_locked;
0d94caffSDavid Greenman
df8bae1dSRodney W. Grimes	/*
d9e23210SJeff Roberson	 * If we need to reclaim memory ask kernel caches to return
c9612b2dSJeff Roberson	 * some.  We rate limit to avoid thrashing.
d9e23210SJeff Roberson	 */
c9612b2dSJeff Roberson	if (vmd == &vm_dom[0] && pass > 0 &&
*a6bf3a9eSRyan Stone	    (time_uptime - lowmem_uptime) >= lowmem_period) {
d9e23210SJeff Roberson		/*
855a310fSJeff Roberson		 * Decrease registered cache sizes.
855a310fSJeff Roberson		 */
14a0d74eSSteven Hartland		SDT_PROBE0(vm, , , vm__lowmem_scan);
855a310fSJeff Roberson		EVENTHANDLER_INVOKE(vm_lowmem, 0);
855a310fSJeff Roberson		/*
d9e23210SJeff Roberson		 * We do this explicitly after the caches have been
d9e23210SJeff Roberson		 * drained above.
855a310fSJeff Roberson		 */
855a310fSJeff Roberson		uma_reclaim();
*a6bf3a9eSRyan Stone		lowmem_uptime = time_uptime;
d9e23210SJeff Roberson	}
5985940eSJohn Dyson
311e34e2SKonstantin Belousov	/*
96240c89SEitan Adler	 * The addl_page_shortage is the number of temporarily
311e34e2SKonstantin Belousov	 * stuck pages in the inactive queue.  In other words, the
449c2e92SKonstantin Belousov	 * number of pages from the inactive count that should be
311e34e2SKonstantin Belousov	 * discounted in setting the target for the active queue scan.
311e34e2SKonstantin Belousov	 */
9099545aSAlan Cox	addl_page_shortage = 0;
9099545aSAlan Cox
1c7c3c6aSMatthew Dillon	/*
1c7c3c6aSMatthew Dillon	 * Calculate the number of pages we want to either free or move
2b6b0df7SMatthew Dillon	 * to the cache.
1c7c3c6aSMatthew Dillon	 */
60196cdaSAlan Cox	if (pass > 0) {
60196cdaSAlan Cox		deficit = atomic_readandclear_int(&vm_pageout_deficit);
9099545aSAlan Cox		page_shortage = vm_paging_target() + deficit;
60196cdaSAlan Cox	} else
60196cdaSAlan Cox		page_shortage = deficit = 0;
1c7c3c6aSMatthew Dillon
936524aaSMatthew Dillon	/*
2b6b0df7SMatthew Dillon	 * maxlaunder limits the number of dirty pages we flush per scan.
2b6b0df7SMatthew Dillon	 * For most systems a smaller value (16 or 32) is more robust under
2b6b0df7SMatthew Dillon	 * extreme memory and disk pressure because any unnecessary writes
2b6b0df7SMatthew Dillon	 * to disk can result in extreme performance degradation.  However,
2b6b0df7SMatthew Dillon	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
2b6b0df7SMatthew Dillon	 * used) will die horribly with limited laundering.  If the pageout
2b6b0df7SMatthew Dillon	 * daemon cannot clean enough pages in the first pass, we let it go
2b6b0df7SMatthew Dillon	 * all out in succeeding passes.
1c7c3c6aSMatthew Dillon	 */
2b6b0df7SMatthew Dillon	if ((maxlaunder = vm_max_launder) <= 1)
2b6b0df7SMatthew Dillon		maxlaunder = 1;
d9e23210SJeff Roberson	if (pass > 1)
2b6b0df7SMatthew Dillon		maxlaunder = 10000;
8d220203SAlan Cox
8d220203SAlan Cox	/*
8d220203SAlan Cox	 * Start scanning the inactive queue for pages we can move to the
8d220203SAlan Cox	 * cache or free.  The scan will stop when the target is reached or
8d220203SAlan Cox	 * we have scanned the entire inactive queue.  Note that m->act_count
8d220203SAlan Cox	 * is not used to form decisions for the inactive queue, only for the
8d220203SAlan Cox	 * active queue.
8d220203SAlan Cox	 */
449c2e92SKonstantin Belousov	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
449c2e92SKonstantin Belousov	maxscan = pq->pq_cnt;
8d220203SAlan Cox	vm_pagequeue_lock(pq);
8d220203SAlan Cox	queues_locked = TRUE;
8d220203SAlan Cox	for (m = TAILQ_FIRST(&pq->pq_pl);
1c7c3c6aSMatthew Dillon	     m != NULL && maxscan-- > 0 && page_shortage > 0;
e929c00dSKirk McKusick	     m = next) {
8d220203SAlan Cox		vm_pagequeue_assert_locked(pq);
48cc2fc7SKonstantin Belousov		KASSERT(queues_locked, ("unlocked queues"));
d4961bcbSKonstantin Belousov		KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));
df8bae1dSRodney W. Grimes
8d220203SAlan Cox		PCPU_INC(cnt.v_pdpages);
c325e866SKonstantin Belousov		next = TAILQ_NEXT(m, plinks.q);
df8bae1dSRodney W. Grimes
936524aaSMatthew Dillon		/*
936524aaSMatthew Dillon		 * skip marker pages
936524aaSMatthew Dillon		 */
936524aaSMatthew Dillon		if (m->flags & PG_MARKER)
936524aaSMatthew Dillon			continue;
936524aaSMatthew Dillon
7900f95dSKonstantin Belousov		KASSERT((m->flags & PG_FICTITIOUS) == 0,
7900f95dSKonstantin Belousov		    ("Fictitious page %p cannot be in inactive queue", m));
7900f95dSKonstantin Belousov		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7900f95dSKonstantin Belousov		    ("Unmanaged page %p cannot be in inactive queue", m));
7900f95dSKonstantin Belousov
8c616246SKonstantin Belousov		/*
311e34e2SKonstantin Belousov		 * The page or object lock acquisitions fail if the
311e34e2SKonstantin Belousov		 * page was removed from the queue or moved to a
311e34e2SKonstantin Belousov		 * different position within the queue.  In either
311e34e2SKonstantin Belousov		 * case, addl_page_shortage should not be incremented.
8c616246SKonstantin Belousov		 */
8c616246SKonstantin Belousov		if (!vm_pageout_page_lock(m, &next)) {
8c616246SKonstantin Belousov			vm_page_unlock(m);
b182ec9eSJohn Dyson			continue;
df8bae1dSRodney W. Grimes		}
9ee2165fSAlan Cox		object = m->object;
89f6b863SAttilio Rao		if (!VM_OBJECT_TRYWLOCK(object) &&
311e34e2SKonstantin Belousov		    !vm_pageout_fallback_object_lock(m, &next)) {
2965a453SKip Macy			vm_page_unlock(m);
89f6b863SAttilio Rao			VM_OBJECT_WUNLOCK(object);
34d9e6fdSAlan Cox			continue;
34d9e6fdSAlan Cox		}
311e34e2SKonstantin Belousov
311e34e2SKonstantin Belousov		/*
311e34e2SKonstantin Belousov		 * Don't mess with busy pages, keep them at the
311e34e2SKonstantin Belousov		 * front of the queue, most likely they are being
311e34e2SKonstantin Belousov		 * paged out.  Increment addl_page_shortage for busy
311e34e2SKonstantin Belousov		 * pages, because they may leave the inactive queue
311e34e2SKonstantin Belousov		 * shortly after page scan is finished.
311e34e2SKonstantin Belousov		 */
c7aebda8SAttilio Rao		if (vm_page_busied(m)) {
2965a453SKip Macy			vm_page_unlock(m);
89f6b863SAttilio Rao			VM_OBJECT_WUNLOCK(object);
b182ec9eSJohn Dyson			addl_page_shortage++;
26f9a767SRodney W. Grimes			continue;
26f9a767SRodney W. Grimes		}
bd7e5f99SJohn Dyson
7e006499SJohn Dyson		/*
8d220203SAlan Cox		 * We unlock the inactive page queue, invalidating the
48cc2fc7SKonstantin Belousov		 * 'next' pointer.  Use our marker to remember our
48cc2fc7SKonstantin Belousov		 * place.
48cc2fc7SKonstantin Belousov		 */
c325e866SKonstantin Belousov		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
8d220203SAlan Cox		vm_pagequeue_unlock(pq);
48cc2fc7SKonstantin Belousov		queues_locked = FALSE;
48cc2fc7SKonstantin Belousov
48cc2fc7SKonstantin Belousov		/*
776f729cSKonstantin Belousov		 * Invalid pages can be easily freed. They cannot be
776f729cSKonstantin Belousov		 * mapped, vm_page_free() asserts this.
776f729cSKonstantin Belousov		 */
776f729cSKonstantin Belousov		if (m->valid == 0 && m->hold_count == 0) {
776f729cSKonstantin Belousov			vm_page_free(m);
776f729cSKonstantin Belousov			PCPU_INC(cnt.v_dfree);
776f729cSKonstantin Belousov			--page_shortage;
776f729cSKonstantin Belousov			goto drop_page;
776f729cSKonstantin Belousov		}
776f729cSKonstantin Belousov
776f729cSKonstantin Belousov		/*
bb7858eaSJeff Roberson		 * We bump the activation count if the page has been
bb7858eaSJeff Roberson		 * referenced while in the inactive queue.  This makes
bb7858eaSJeff Roberson		 * it less likely that the page will be added back to the
bb7858eaSJeff Roberson		 * inactive queue prematurely again.  Here we check the
1c7c3c6aSMatthew Dillon		 * page tables (or emulated bits, if any), given the upper
1c7c3c6aSMatthew Dillon		 * level VM system not knowing anything about existing
1c7c3c6aSMatthew Dillon		 * references.
7e006499SJohn Dyson		 */
bb7858eaSJeff Roberson		if ((m->aflags & PGA_REFERENCED) != 0) {
bb7858eaSJeff Roberson			vm_page_aflag_clear(m, PGA_REFERENCED);
bb7858eaSJeff Roberson			act_delta = 1;
86fa2471SAlan Cox		} else
86fa2471SAlan Cox			act_delta = 0;
bb7858eaSJeff Roberson		if (object->ref_count != 0) {
bb7858eaSJeff Roberson			act_delta += pmap_ts_referenced(m);
bb7858eaSJeff Roberson		} else {
bb7858eaSJeff Roberson			KASSERT(!pmap_page_is_mapped(m),
bb7858eaSJeff Roberson			    ("vm_pageout_scan: page %p is mapped", m));
2fe6e4d7SDavid Greenman		}
ef743ce6SJohn Dyson
7e006499SJohn Dyson		/*
1c7c3c6aSMatthew Dillon		 * If the upper level VM system knows about any page
bb7858eaSJeff Roberson		 * references, we reactivate the page or requeue it.
7e006499SJohn Dyson		 */
bb7858eaSJeff Roberson		if (act_delta != 0) {
86fa2471SAlan Cox			if (object->ref_count != 0) {
26f9a767SRodney W. Grimes				vm_page_activate(m);
bb7858eaSJeff Roberson				m->act_count += act_delta + ACT_ADVANCE;
bb7858eaSJeff Roberson			} else {
bb7858eaSJeff Roberson				vm_pagequeue_lock(pq);
bb7858eaSJeff Roberson				queues_locked = TRUE;
bb7858eaSJeff Roberson				vm_page_requeue_locked(m);
bb7858eaSJeff Roberson			}
776f729cSKonstantin Belousov			goto drop_page;
0d94caffSDavid Greenman		}
67bf6868SJohn Dyson
311e34e2SKonstantin Belousov		if (m->hold_count != 0) {
311e34e2SKonstantin Belousov			/*
311e34e2SKonstantin Belousov			 * Held pages are essentially stuck in the
311e34e2SKonstantin Belousov			 * queue.  So, they ought to be discounted
449c2e92SKonstantin Belousov			 * from the inactive count.  See the
311e34e2SKonstantin Belousov			 * calculation of the page_shortage for the
311e34e2SKonstantin Belousov			 * loop over the active queue below.
311e34e2SKonstantin Belousov			 */
311e34e2SKonstantin Belousov			addl_page_shortage++;
776f729cSKonstantin Belousov			goto drop_page;
311e34e2SKonstantin Belousov		}
311e34e2SKonstantin Belousov
7e006499SJohn Dyson		/*
9fc4739dSAlan Cox		 * If the page appears to be clean at the machine-independent
9fc4739dSAlan Cox		 * layer, then remove all of its mappings from the pmap in
9fc4739dSAlan Cox		 * anticipation of placing it onto the cache queue.  If,
9fc4739dSAlan Cox		 * however, any of the page's mappings allow write access,
9fc4739dSAlan Cox		 * then the page may still be modified until the last of those
9fc4739dSAlan Cox		 * mappings are removed.
7e006499SJohn Dyson		 */
aa044135SAlan Cox		if (object->ref_count != 0) {
9fc4739dSAlan Cox			vm_page_test_dirty(m);
aa044135SAlan Cox			if (m->dirty == 0)
b78ddb0bSAlan Cox				pmap_remove_all(m);
aa044135SAlan Cox		}
dcbcd518SBruce Evans
776f729cSKonstantin Belousov		if (m->dirty == 0) {
6989c456SAlan Cox			/*
78afdce6SAlan Cox			 * Clean pages can be freed.
6989c456SAlan Cox			 */
78afdce6SAlan Cox			vm_page_free(m);
78afdce6SAlan Cox			PCPU_INC(cnt.v_dfree);
1c7c3c6aSMatthew Dillon			--page_shortage;
d9e23210SJeff Roberson		} else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
7e006499SJohn Dyson			/*
2b6b0df7SMatthew Dillon			 * Dirty pages need to be paged out, but flushing
ab46f63eSJohn Baldwin			 * a page is extremely expensive versus freeing
2b6b0df7SMatthew Dillon			 * a clean page.  Rather than artificially limiting
2b6b0df7SMatthew Dillon			 * the number of pages we can flush, we instead give
2b6b0df7SMatthew Dillon			 * dirty pages extra priority on the inactive queue
2b6b0df7SMatthew Dillon			 * by forcing them to be cycled through the queue
2b6b0df7SMatthew Dillon			 * twice before being flushed, after which the
2b6b0df7SMatthew Dillon			 * (now clean) page will cycle through once more
2b6b0df7SMatthew Dillon			 * before being freed.  This significantly extends
2b6b0df7SMatthew Dillon			 * the thrash point for a heavily loaded machine.
7e006499SJohn Dyson			 */
3407fefeSKonstantin Belousov			m->flags |= PG_WINATCFLS;
8d220203SAlan Cox			vm_pagequeue_lock(pq);
48cc2fc7SKonstantin Belousov			queues_locked = TRUE;
8d220203SAlan Cox			vm_page_requeue_locked(m);
0d94caffSDavid Greenman		} else if (maxlaunder > 0) {
2b6b0df7SMatthew Dillon			/*
2b6b0df7SMatthew Dillon			 * We always want to try to flush some dirty pages if
2b6b0df7SMatthew Dillon			 * we encounter them, to keep the system stable.
2b6b0df7SMatthew Dillon			 * Normally this number is small, but under extreme
2b6b0df7SMatthew Dillon			 * pressure where there are insufficient clean pages
2b6b0df7SMatthew Dillon			 * on the inactive queue, we may have to go all out.
2b6b0df7SMatthew Dillon			 */
5050aa86SKonstantin Belousov			int swap_pageouts_ok;
34d8b7eaSJeff Roberson			int error;
0d94caffSDavid Greenman
12ac6a1dSJohn Dyson			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
12ac6a1dSJohn Dyson				swap_pageouts_ok = 1;
12ac6a1dSJohn Dyson			} else {
12ac6a1dSJohn Dyson				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
12ac6a1dSJohn Dyson				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
90ecac61SMatthew Dillon				vm_page_count_min());
12ac6a1dSJohn Dyson
12ac6a1dSJohn Dyson			}
70111b90SJohn Dyson
70111b90SJohn Dyson			/*
1c7c3c6aSMatthew Dillon			 * We don't bother paging objects that are "dead".
1c7c3c6aSMatthew Dillon			 * Those objects are in a "rundown" state.
70111b90SJohn Dyson			 */
70111b90SJohn Dyson			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
8d220203SAlan Cox				vm_pagequeue_lock(pq);
2965a453SKip Macy				vm_page_unlock(m);
89f6b863SAttilio Rao				VM_OBJECT_WUNLOCK(object);
48cc2fc7SKonstantin Belousov				queues_locked = TRUE;
8d220203SAlan Cox				vm_page_requeue_locked(m);
48cc2fc7SKonstantin Belousov				goto relock_queues;
12ac6a1dSJohn Dyson			}
34d8b7eaSJeff Roberson			error = vm_pageout_clean(m);
1c7c3c6aSMatthew Dillon			/*
34d8b7eaSJeff Roberson			 * Decrement page_shortage on success to account for
2b6b0df7SMatthew Dillon			 * the (future) cleaned page.  Otherwise we could wind
2b6b0df7SMatthew Dillon			 * up laundering or cleaning too many pages.
0d94caffSDavid Greenman			 */
34d8b7eaSJeff Roberson			if (error == 0) {
34d8b7eaSJeff Roberson				page_shortage--;
34d8b7eaSJeff Roberson				maxlaunder--;
34d8b7eaSJeff Roberson			} else if (error == EDEADLK) {
34d8b7eaSJeff Roberson				pageout_lock_miss++;
34d8b7eaSJeff Roberson				vnodes_skipped++;
34d8b7eaSJeff Roberson			} else if (error == EBUSY) {
34d8b7eaSJeff Roberson				addl_page_shortage++;
48cc2fc7SKonstantin Belousov			}
48cc2fc7SKonstantin Belousov			vm_page_lock_assert(m, MA_NOTOWNED);
48cc2fc7SKonstantin Belousov			goto relock_queues;
48cc2fc7SKonstantin Belousov		}
776f729cSKonstantin Belousovdrop_page:
48cc2fc7SKonstantin Belousov		vm_page_unlock(m);
89f6b863SAttilio Rao		VM_OBJECT_WUNLOCK(object);
48cc2fc7SKonstantin Belousovrelock_queues:
48cc2fc7SKonstantin Belousov		if (!queues_locked) {
8d220203SAlan Cox			vm_pagequeue_lock(pq);
48cc2fc7SKonstantin Belousov			queues_locked = TRUE;
6989c456SAlan Cox		}
c325e866SKonstantin Belousov		next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
c325e866SKonstantin Belousov		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
0d94caffSDavid Greenman	}
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
26f9a767SRodney W. Grimes
9452b5edSAlan Cox#if !defined(NO_SWAPPING)
9452b5edSAlan Cox	/*
9452b5edSAlan Cox	 * Wakeup the swapout daemon if we didn't cache or free the targeted
9452b5edSAlan Cox	 * number of pages.
9452b5edSAlan Cox	 */
9452b5edSAlan Cox	if (vm_swap_enabled && page_shortage > 0)
9452b5edSAlan Cox		vm_req_vmdaemon(VM_SWAP_NORMAL);
9452b5edSAlan Cox#endif
9452b5edSAlan Cox
9452b5edSAlan Cox	/*
9452b5edSAlan Cox	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
9452b5edSAlan Cox	 * and we didn't cache or free enough pages.
9452b5edSAlan Cox	 */
9452b5edSAlan Cox	if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target -
9452b5edSAlan Cox	    vm_cnt.v_free_min)
9452b5edSAlan Cox		(void)speedup_syncer();
9452b5edSAlan Cox
df8bae1dSRodney W. Grimes	/*
936524aaSMatthew Dillon	 * Compute the number of pages we want to try to move from the
936524aaSMatthew Dillon	 * active queue to the inactive queue.
1c7c3c6aSMatthew Dillon	 */
44f1c916SBryan Drewery	page_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count +
9099545aSAlan Cox	    vm_paging_target() + deficit + addl_page_shortage;
9099545aSAlan Cox
114f62c6SJeff Roberson	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
114f62c6SJeff Roberson	vm_pagequeue_lock(pq);
9099545aSAlan Cox	maxscan = pq->pq_cnt;
9099545aSAlan Cox
d9e23210SJeff Roberson	/*
d9e23210SJeff Roberson	 * If we're just idle polling attempt to visit every
d9e23210SJeff Roberson	 * active page within 'update_period' seconds.
d9e23210SJeff Roberson	 */
22cf98d1SAlan Cox	scan_tick = ticks;
22cf98d1SAlan Cox	if (vm_pageout_update_period != 0) {
22cf98d1SAlan Cox		min_scan = pq->pq_cnt;
22cf98d1SAlan Cox		min_scan *= scan_tick - vmd->vmd_last_active_scan;
22cf98d1SAlan Cox		min_scan /= hz * vm_pageout_update_period;
22cf98d1SAlan Cox	} else
22cf98d1SAlan Cox		min_scan = 0;
22cf98d1SAlan Cox	if (min_scan > 0 || (page_shortage > 0 && maxscan > 0))
22cf98d1SAlan Cox		vmd->vmd_last_active_scan = scan_tick;
1c7c3c6aSMatthew Dillon
1c7c3c6aSMatthew Dillon	/*
22cf98d1SAlan Cox	 * Scan the active queue for pages that can be deactivated.  Update
22cf98d1SAlan Cox	 * the per-page activity counter and use it to identify deactivation
22cf98d1SAlan Cox	 * candidates.
1c7c3c6aSMatthew Dillon	 */
22cf98d1SAlan Cox	for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
22cf98d1SAlan Cox	    min_scan || (page_shortage > 0 && scanned < maxscan)); m = next,
22cf98d1SAlan Cox	    scanned++) {
f35329acSJohn Dyson
9cf51988SAlan Cox		KASSERT(m->queue == PQ_ACTIVE,
d3c09dd7SAlan Cox		    ("vm_pageout_scan: page %p isn't active", m));
f35329acSJohn Dyson
c325e866SKonstantin Belousov		next = TAILQ_NEXT(m, plinks.q);
22cf98d1SAlan Cox		if ((m->flags & PG_MARKER) != 0)
8dbca793STor Egge			continue;
7900f95dSKonstantin Belousov		KASSERT((m->flags & PG_FICTITIOUS) == 0,
7900f95dSKonstantin Belousov		    ("Fictitious page %p cannot be in active queue", m));
7900f95dSKonstantin Belousov		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7900f95dSKonstantin Belousov		    ("Unmanaged page %p cannot be in active queue", m));
9ee2165fSAlan Cox		if (!vm_pageout_page_lock(m, &next)) {
8c616246SKonstantin Belousov			vm_page_unlock(m);
2965a453SKip Macy			continue;
2965a453SKip Macy		}
b18bfc3dSJohn Dyson
b18bfc3dSJohn Dyson		/*
b18bfc3dSJohn Dyson		 * The count for pagedaemon pages is done after checking the
956f3135SPhilippe Charnier		 * page for eligibility...
b18bfc3dSJohn Dyson		 */
8d220203SAlan Cox		PCPU_INC(cnt.v_pdpages);
ef743ce6SJohn Dyson
7e006499SJohn Dyson		/*
7e006499SJohn Dyson		 * Check to see "how much" the page has been used.
7e006499SJohn Dyson		 */
86fa2471SAlan Cox		if ((m->aflags & PGA_REFERENCED) != 0) {
bb7858eaSJeff Roberson			vm_page_aflag_clear(m, PGA_REFERENCED);
86fa2471SAlan Cox			act_delta = 1;
86fa2471SAlan Cox		} else
86fa2471SAlan Cox			act_delta = 0;
86fa2471SAlan Cox
274132acSJeff Roberson		/*
274132acSJeff Roberson		 * Unlocked object ref count check.  Two races are possible.
274132acSJeff Roberson		 * 1) The ref was transitioning to zero and we saw non-zero,
274132acSJeff Roberson		 *    the pmap bits will be checked unnecessarily.
274132acSJeff Roberson		 * 2) The ref was transitioning to one and we saw zero.
274132acSJeff Roberson		 *    The page lock prevents a new reference to this page so
274132acSJeff Roberson		 *    we need not check the reference bits.
274132acSJeff Roberson		 */
274132acSJeff Roberson		if (m->object->ref_count != 0)
bb7858eaSJeff Roberson			act_delta += pmap_ts_referenced(m);
bb7858eaSJeff Roberson
bb7858eaSJeff Roberson		/*
bb7858eaSJeff Roberson		 * Advance or decay the act_count based on recent usage.
bb7858eaSJeff Roberson		 */
86fa2471SAlan Cox		if (act_delta != 0) {
bb7858eaSJeff Roberson			m->act_count += ACT_ADVANCE + act_delta;
38efa82bSJohn Dyson			if (m->act_count > ACT_MAX)
38efa82bSJohn Dyson				m->act_count = ACT_MAX;
86fa2471SAlan Cox		} else
38efa82bSJohn Dyson			m->act_count -= min(m->act_count, ACT_DECLINE);
bb7858eaSJeff Roberson
bb7858eaSJeff Roberson		/*
bb7858eaSJeff Roberson		 * Move this page to the tail of the active or inactive
bb7858eaSJeff Roberson		 * queue depending on usage.
bb7858eaSJeff Roberson		 */
86fa2471SAlan Cox		if (m->act_count == 0) {
8d220203SAlan Cox			/* Dequeue to avoid later lock recursion. */
8d220203SAlan Cox			vm_page_dequeue_locked(m);
d4a272dbSJohn Dyson			vm_page_deactivate(m);
bb7858eaSJeff Roberson			page_shortage--;
8d220203SAlan Cox		} else
8d220203SAlan Cox			vm_page_requeue_locked(m);
2965a453SKip Macy		vm_page_unlock(m);
26f9a767SRodney W. Grimes	}
8d220203SAlan Cox	vm_pagequeue_unlock(pq);
ceb0cf87SJohn Dyson#if !defined(NO_SWAPPING)
ceb0cf87SJohn Dyson	/*
ceb0cf87SJohn Dyson	 * Idle process swapout -- run once per second.
ceb0cf87SJohn Dyson	 */
ceb0cf87SJohn Dyson	if (vm_swap_idle_enabled) {
ceb0cf87SJohn Dyson		static long lsec;
227ee8a1SPoul-Henning Kamp		if (time_second != lsec) {
97824da3SAlan Cox			vm_req_vmdaemon(VM_SWAP_IDLE);
227ee8a1SPoul-Henning Kamp			lsec = time_second;
ceb0cf87SJohn Dyson		}
ceb0cf87SJohn Dyson	}
ceb0cf87SJohn Dyson#endif
ceb0cf87SJohn Dyson
5663e6deSDavid Greenman	/*
e92686d0SDavid Schultz	 * If we are critically low on one of RAM or swap and low on
e92686d0SDavid Schultz	 * the other, kill the largest process.  However, we avoid
e92686d0SDavid Schultz	 * doing this on the first pass in order to give ourselves a
e92686d0SDavid Schultz	 * chance to flush out dirty vnode-backed pages and to allow
e92686d0SDavid Schultz	 * active pages to be moved to the inactive queue and reclaimed.
2025d69bSKonstantin Belousov	 */
449c2e92SKonstantin Belousov	vm_pageout_mightbe_oom(vmd, pass);
2025d69bSKonstantin Belousov}
2025d69bSKonstantin Belousov
449c2e92SKonstantin Belousovstatic int vm_pageout_oom_vote;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov/*
449c2e92SKonstantin Belousov * The pagedaemon threads randlomly select one to perform the
449c2e92SKonstantin Belousov * OOM.  Trying to kill processes before all pagedaemons
449c2e92SKonstantin Belousov * failed to reach free target is premature.
449c2e92SKonstantin Belousov */
449c2e92SKonstantin Belousovstatic void
449c2e92SKonstantin Belousovvm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
449c2e92SKonstantin Belousov{
449c2e92SKonstantin Belousov	int old_vote;
449c2e92SKonstantin Belousov
d9e23210SJeff Roberson	if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
449c2e92SKonstantin Belousov	    (swap_pager_full && vm_paging_target() > 0))) {
449c2e92SKonstantin Belousov		if (vmd->vmd_oom) {
449c2e92SKonstantin Belousov			vmd->vmd_oom = FALSE;
449c2e92SKonstantin Belousov			atomic_subtract_int(&vm_pageout_oom_vote, 1);
449c2e92SKonstantin Belousov		}
449c2e92SKonstantin Belousov		return;
449c2e92SKonstantin Belousov	}
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	if (vmd->vmd_oom)
449c2e92SKonstantin Belousov		return;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	vmd->vmd_oom = TRUE;
449c2e92SKonstantin Belousov	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
449c2e92SKonstantin Belousov	if (old_vote != vm_ndomains - 1)
449c2e92SKonstantin Belousov		return;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
449c2e92SKonstantin Belousov	 * The current pagedaemon thread is the last in the quorum to
449c2e92SKonstantin Belousov	 * start OOM.  Initiate the selection and signaling of the
449c2e92SKonstantin Belousov	 * victim.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov	vm_pageout_oom(VM_OOM_MEM);
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
449c2e92SKonstantin Belousov	 * After one round of OOM terror, recall our vote.  On the
449c2e92SKonstantin Belousov	 * next pass, current pagedaemon would vote again if the low
449c2e92SKonstantin Belousov	 * memory condition is still there, due to vmd_oom being
449c2e92SKonstantin Belousov	 * false.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov	vmd->vmd_oom = FALSE;
449c2e92SKonstantin Belousov	atomic_subtract_int(&vm_pageout_oom_vote, 1);
449c2e92SKonstantin Belousov}
2025d69bSKonstantin Belousov
2025d69bSKonstantin Belousovvoid
2025d69bSKonstantin Belousovvm_pageout_oom(int shortage)
2025d69bSKonstantin Belousov{
2025d69bSKonstantin Belousov	struct proc *p, *bigproc;
2025d69bSKonstantin Belousov	vm_offset_t size, bigsize;
2025d69bSKonstantin Belousov	struct thread *td;
6bed074cSKonstantin Belousov	struct vmspace *vm;
2025d69bSKonstantin Belousov
2025d69bSKonstantin Belousov	/*
1c58e4e5SJohn Baldwin	 * We keep the process bigproc locked once we find it to keep anyone
1c58e4e5SJohn Baldwin	 * from messing with it; however, there is a possibility of
1c58e4e5SJohn Baldwin	 * deadlock if process B is bigproc and one of it's child processes
1c58e4e5SJohn Baldwin	 * attempts to propagate a signal to B while we are waiting for A's
1c58e4e5SJohn Baldwin	 * lock while walking this list.  To avoid this, we don't block on
1c58e4e5SJohn Baldwin	 * the process lock but just skip a process if it is already locked.
5663e6deSDavid Greenman	 */
5663e6deSDavid Greenman	bigproc = NULL;
5663e6deSDavid Greenman	bigsize = 0;
1005a129SJohn Baldwin	sx_slock(&allproc_lock);
e602ba25SJulian Elischer	FOREACH_PROC_IN_SYSTEM(p) {
e602ba25SJulian Elischer		int breakout;
dcbcd518SBruce Evans
71943c3dSKonstantin Belousov		PROC_LOCK(p);
71943c3dSKonstantin Belousov
1c58e4e5SJohn Baldwin		/*
3f1c4c4fSKonstantin Belousov		 * If this is a system, protected or killed process, skip it.
5663e6deSDavid Greenman		 */
71943c3dSKonstantin Belousov		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
71943c3dSKonstantin Belousov		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
71943c3dSKonstantin Belousov		    p->p_pid == 1 || P_KILLED(p) ||
71943c3dSKonstantin Belousov		    (p->p_pid < 48 && swap_pager_avail != 0)) {
8606d880SJohn Baldwin			PROC_UNLOCK(p);
5663e6deSDavid Greenman			continue;
5663e6deSDavid Greenman		}
5663e6deSDavid Greenman		/*
dcbcd518SBruce Evans		 * If the process is in a non-running type state,
e602ba25SJulian Elischer		 * don't touch it.  Check all the threads individually.
5663e6deSDavid Greenman		 */
e602ba25SJulian Elischer		breakout = 0;
e602ba25SJulian Elischer		FOREACH_THREAD_IN_PROC(p, td) {
982d11f8SJeff Roberson			thread_lock(td);
71fad9fdSJulian Elischer			if (!TD_ON_RUNQ(td) &&
71fad9fdSJulian Elischer			    !TD_IS_RUNNING(td) &&
f497cda2SEdward Tomasz Napierala			    !TD_IS_SLEEPING(td) &&
f497cda2SEdward Tomasz Napierala			    !TD_IS_SUSPENDED(td)) {
982d11f8SJeff Roberson				thread_unlock(td);
e602ba25SJulian Elischer				breakout = 1;
e602ba25SJulian Elischer				break;
e602ba25SJulian Elischer			}
982d11f8SJeff Roberson			thread_unlock(td);
e602ba25SJulian Elischer		}
e602ba25SJulian Elischer		if (breakout) {
1c58e4e5SJohn Baldwin			PROC_UNLOCK(p);
5663e6deSDavid Greenman			continue;
5663e6deSDavid Greenman		}
5663e6deSDavid Greenman		/*
5663e6deSDavid Greenman		 * get the process size
5663e6deSDavid Greenman		 */
6bed074cSKonstantin Belousov		vm = vmspace_acquire_ref(p);
6bed074cSKonstantin Belousov		if (vm == NULL) {
6bed074cSKonstantin Belousov			PROC_UNLOCK(p);
6bed074cSKonstantin Belousov			continue;
6bed074cSKonstantin Belousov		}
71943c3dSKonstantin Belousov		_PHOLD(p);
6bed074cSKonstantin Belousov		if (!vm_map_trylock_read(&vm->vm_map)) {
71943c3dSKonstantin Belousov			_PRELE(p);
72d97679SDavid Schultz			PROC_UNLOCK(p);
71943c3dSKonstantin Belousov			vmspace_free(vm);
72d97679SDavid Schultz			continue;
72d97679SDavid Schultz		}
71943c3dSKonstantin Belousov		PROC_UNLOCK(p);
7981aa24SKonstantin Belousov		size = vmspace_swap_count(vm);
6bed074cSKonstantin Belousov		vm_map_unlock_read(&vm->vm_map);
2025d69bSKonstantin Belousov		if (shortage == VM_OOM_MEM)
6bed074cSKonstantin Belousov			size += vmspace_resident_count(vm);
6bed074cSKonstantin Belousov		vmspace_free(vm);
5663e6deSDavid Greenman		/*
5663e6deSDavid Greenman		 * if the this process is bigger than the biggest one
5663e6deSDavid Greenman		 * remember it.
5663e6deSDavid Greenman		 */
5663e6deSDavid Greenman		if (size > bigsize) {
1c58e4e5SJohn Baldwin			if (bigproc != NULL)
71943c3dSKonstantin Belousov				PRELE(bigproc);
5663e6deSDavid Greenman			bigproc = p;
5663e6deSDavid Greenman			bigsize = size;
71943c3dSKonstantin Belousov		} else {
71943c3dSKonstantin Belousov			PRELE(p);
71943c3dSKonstantin Belousov		}
5663e6deSDavid Greenman	}
1005a129SJohn Baldwin	sx_sunlock(&allproc_lock);
5663e6deSDavid Greenman	if (bigproc != NULL) {
8311a2b8SWill Andrews		if (vm_panic_on_oom != 0)
8311a2b8SWill Andrews			panic("out of swap space");
71943c3dSKonstantin Belousov		PROC_LOCK(bigproc);
729b1e51SDavid Greenman		killproc(bigproc, "out of swap space");
fa885116SJulian Elischer		sched_nice(bigproc, PRIO_MIN);
71943c3dSKonstantin Belousov		_PRELE(bigproc);
1c58e4e5SJohn Baldwin		PROC_UNLOCK(bigproc);
44f1c916SBryan Drewery		wakeup(&vm_cnt.v_free_count);
5663e6deSDavid Greenman	}
5663e6deSDavid Greenman}
26f9a767SRodney W. Grimes
449c2e92SKonstantin Belousovstatic void
449c2e92SKonstantin Belousovvm_pageout_worker(void *arg)
449c2e92SKonstantin Belousov{
449c2e92SKonstantin Belousov	struct vm_domain *domain;
949c9186SKonstantin Belousov	int domidx;
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	domidx = (uintptr_t)arg;
449c2e92SKonstantin Belousov	domain = &vm_dom[domidx];
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
949c9186SKonstantin Belousov	 * XXXKIB It could be useful to bind pageout daemon threads to
949c9186SKonstantin Belousov	 * the cores belonging to the domain, from which vm_page_array
949c9186SKonstantin Belousov	 * is allocated.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
22cf98d1SAlan Cox	domain->vmd_last_active_scan = ticks;
449c2e92SKonstantin Belousov	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
449c2e92SKonstantin Belousov
449c2e92SKonstantin Belousov	/*
449c2e92SKonstantin Belousov	 * The pageout daemon worker is never done, so loop forever.
449c2e92SKonstantin Belousov	 */
449c2e92SKonstantin Belousov	while (TRUE) {
449c2e92SKonstantin Belousov		/*
449c2e92SKonstantin Belousov		 * If we have enough free memory, wakeup waiters.  Do
449c2e92SKonstantin Belousov		 * not clear vm_pages_needed until we reach our target,
449c2e92SKonstantin Belousov		 * otherwise we may be woken up over and over again and
449c2e92SKonstantin Belousov		 * waste a lot of cpu.
449c2e92SKonstantin Belousov		 */
449c2e92SKonstantin Belousov		mtx_lock(&vm_page_queue_free_mtx);
449c2e92SKonstantin Belousov		if (vm_pages_needed && !vm_page_count_min()) {
449c2e92SKonstantin Belousov			if (!vm_paging_needed())
449c2e92SKonstantin Belousov				vm_pages_needed = 0;
44f1c916SBryan Drewery			wakeup(&vm_cnt.v_free_count);
449c2e92SKonstantin Belousov		}
449c2e92SKonstantin Belousov		if (vm_pages_needed) {
449c2e92SKonstantin Belousov			/*
449c2e92SKonstantin Belousov			 * Still not done, take a second pass without waiting
449c2e92SKonstantin Belousov			 * (unlimited dirty cleaning), otherwise sleep a bit
449c2e92SKonstantin Belousov			 * and try again.
449c2e92SKonstantin Belousov			 */
449c2e92SKonstantin Belousov			if (domain->vmd_pass > 1)
449c2e92SKonstantin Belousov				msleep(&vm_pages_needed,
449c2e92SKonstantin Belousov				    &vm_page_queue_free_mtx, PVM, "psleep",
449c2e92SKonstantin Belousov				    hz / 2);
449c2e92SKonstantin Belousov		} else {
449c2e92SKonstantin Belousov			/*
d9e23210SJeff Roberson			 * Good enough, sleep until required to refresh
d9e23210SJeff Roberson			 * stats.
449c2e92SKonstantin Belousov			 */
449c2e92SKonstantin Belousov			domain->vmd_pass = 0;
d9e23210SJeff Roberson			msleep(&vm_pages_needed, &vm_page_queue_free_mtx,
d9e23210SJeff Roberson			    PVM, "psleep", hz);
d9e23210SJeff Roberson
449c2e92SKonstantin Belousov		}
d9e23210SJeff Roberson		if (vm_pages_needed) {
44f1c916SBryan Drewery			vm_cnt.v_pdwakeups++;
d9e23210SJeff Roberson			domain->vmd_pass++;
d9e23210SJeff Roberson		}
449c2e92SKonstantin Belousov		mtx_unlock(&vm_page_queue_free_mtx);
449c2e92SKonstantin Belousov		vm_pageout_scan(domain, domain->vmd_pass);
449c2e92SKonstantin Belousov	}
449c2e92SKonstantin Belousov}
449c2e92SKonstantin Belousov
df8bae1dSRodney W. Grimes/*
4d19f4adSSteven Hartland *	vm_pageout_init initialises basic pageout daemon settings.
df8bae1dSRodney W. Grimes */
2b14f991SJulian Elischerstatic void
4d19f4adSSteven Hartlandvm_pageout_init(void)
df8bae1dSRodney W. Grimes{
df8bae1dSRodney W. Grimes	/*
df8bae1dSRodney W. Grimes	 * Initialize some paging parameters.
df8bae1dSRodney W. Grimes	 */
44f1c916SBryan Drewery	vm_cnt.v_interrupt_free_min = 2;
44f1c916SBryan Drewery	if (vm_cnt.v_page_count < 2000)
f35329acSJohn Dyson		vm_pageout_page_count = 8;
f6b04d2bSDavid Greenman
45ae1d91SAlan Cox	/*
45ae1d91SAlan Cox	 * v_free_reserved needs to include enough for the largest
45ae1d91SAlan Cox	 * swap pager structures plus enough for any pv_entry structs
45ae1d91SAlan Cox	 * when paging.
45ae1d91SAlan Cox	 */
44f1c916SBryan Drewery	if (vm_cnt.v_page_count > 1024)
44f1c916SBryan Drewery		vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
2feb50bfSAttilio Rao	else
44f1c916SBryan Drewery		vm_cnt.v_free_min = 4;
44f1c916SBryan Drewery	vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
44f1c916SBryan Drewery	    vm_cnt.v_interrupt_free_min;
44f1c916SBryan Drewery	vm_cnt.v_free_reserved = vm_pageout_page_count +
44f1c916SBryan Drewery	    vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
44f1c916SBryan Drewery	vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
44f1c916SBryan Drewery	vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
44f1c916SBryan Drewery	vm_cnt.v_free_min += vm_cnt.v_free_reserved;
44f1c916SBryan Drewery	vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
44f1c916SBryan Drewery	vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
44f1c916SBryan Drewery	if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
44f1c916SBryan Drewery		vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
df8bae1dSRodney W. Grimes
d9e23210SJeff Roberson	/*
d9e23210SJeff Roberson	 * Set the default wakeup threshold to be 10% above the minimum
d9e23210SJeff Roberson	 * page limit.  This keeps the steady state out of shortfall.
d9e23210SJeff Roberson	 */
44f1c916SBryan Drewery	vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
d9e23210SJeff Roberson
d9e23210SJeff Roberson	/*
d9e23210SJeff Roberson	 * Set interval in seconds for active scan.  We want to visit each
c9612b2dSJeff Roberson	 * page at least once every ten minutes.  This is to prevent worst
c9612b2dSJeff Roberson	 * case paging behaviors with stale active LRU.
d9e23210SJeff Roberson	 */
d9e23210SJeff Roberson	if (vm_pageout_update_period == 0)
c9612b2dSJeff Roberson		vm_pageout_update_period = 600;
d9e23210SJeff Roberson
df8bae1dSRodney W. Grimes	/* XXX does not really belong here */
df8bae1dSRodney W. Grimes	if (vm_page_max_wired == 0)
44f1c916SBryan Drewery		vm_page_max_wired = vm_cnt.v_free_count / 3;
4d19f4adSSteven Hartland}
4d19f4adSSteven Hartland
4d19f4adSSteven Hartland/*
4d19f4adSSteven Hartland *     vm_pageout is the high level pageout daemon.
4d19f4adSSteven Hartland */
4d19f4adSSteven Hartlandstatic void
4d19f4adSSteven Hartlandvm_pageout(void)
4d19f4adSSteven Hartland{
44ec2b63SKonstantin Belousov	int error;
4d19f4adSSteven Hartland#if MAXMEMDOM > 1
44ec2b63SKonstantin Belousov	int i;
4d19f4adSSteven Hartland#endif
df8bae1dSRodney W. Grimes
24a1cce3SDavid Greenman	swap_pager_swap_init();
449c2e92SKonstantin Belousov#if MAXMEMDOM > 1
449c2e92SKonstantin Belousov	for (i = 1; i < vm_ndomains; i++) {
449c2e92SKonstantin Belousov		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
449c2e92SKonstantin Belousov		    curproc, NULL, 0, 0, "dom%d", i);
449c2e92SKonstantin Belousov		if (error != 0) {
449c2e92SKonstantin Belousov			panic("starting pageout for domain %d, error %d\n",
449c2e92SKonstantin Belousov			    i, error);
dc2efb27SJohn Dyson		}
f919ebdeSDavid Greenman	}
449c2e92SKonstantin Belousov#endif
44ec2b63SKonstantin Belousov	error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
44ec2b63SKonstantin Belousov	    0, 0, "uma");
44ec2b63SKonstantin Belousov	if (error != 0)
44ec2b63SKonstantin Belousov		panic("starting uma_reclaim helper, error %d\n", error);
d395270dSDimitry Andric	vm_pageout_worker((void *)(uintptr_t)0);
df8bae1dSRodney W. Grimes}
26f9a767SRodney W. Grimes
6b4b77adSAlan Cox/*
e9f995d8SAlan Cox * Unless the free page queue lock is held by the caller, this function
6b4b77adSAlan Cox * should be regarded as advisory.  Specifically, the caller should
44f1c916SBryan Drewery * not msleep() on &vm_cnt.v_free_count following this function unless
e9f995d8SAlan Cox * the free page queue lock is held until the msleep() is performed.
6b4b77adSAlan Cox */
e0c5a895SJohn Dysonvoid
4a365329SAndrey Zonovpagedaemon_wakeup(void)
e0c5a895SJohn Dyson{
a1c0a785SAlan Cox
b40ce416SJulian Elischer	if (!vm_pages_needed && curthread->td_proc != pageproc) {
a1c0a785SAlan Cox		vm_pages_needed = 1;
e0c5a895SJohn Dyson		wakeup(&vm_pages_needed);
e0c5a895SJohn Dyson	}
e0c5a895SJohn Dyson}
e0c5a895SJohn Dyson
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
5afce282SDavid Greenmanstatic void
97824da3SAlan Coxvm_req_vmdaemon(int req)
5afce282SDavid Greenman{
5afce282SDavid Greenman	static int lastrun = 0;
5afce282SDavid Greenman
97824da3SAlan Cox	mtx_lock(&vm_daemon_mtx);
97824da3SAlan Cox	vm_pageout_req_swapout |= req;
b18bfc3dSJohn Dyson	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
5afce282SDavid Greenman		wakeup(&vm_daemon_needed);
5afce282SDavid Greenman		lastrun = ticks;
5afce282SDavid Greenman	}
97824da3SAlan Cox	mtx_unlock(&vm_daemon_mtx);
5afce282SDavid Greenman}
5afce282SDavid Greenman
2b14f991SJulian Elischerstatic void
4a365329SAndrey Zonovvm_daemon(void)
0d94caffSDavid Greenman{
91d5354aSJohn Baldwin	struct rlimit rsslim;
dcbcd518SBruce Evans	struct proc *p;
dcbcd518SBruce Evans	struct thread *td;
6bed074cSKonstantin Belousov	struct vmspace *vm;
099e7e95SEdward Tomasz Napierala	int breakout, swapout_flags, tryagain, attempts;
afcc55f3SEdward Tomasz Napierala#ifdef RACCT
099e7e95SEdward Tomasz Napierala	uint64_t rsize, ravailable;
afcc55f3SEdward Tomasz Napierala#endif
0d94caffSDavid Greenman
2fe6e4d7SDavid Greenman	while (TRUE) {
97824da3SAlan Cox		mtx_lock(&vm_daemon_mtx);
4b5c9cf6SEdward Tomasz Napierala		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
099e7e95SEdward Tomasz Napierala#ifdef RACCT
4b5c9cf6SEdward Tomasz Napierala		    racct_enable ? hz : 0
099e7e95SEdward Tomasz Napierala#else
4b5c9cf6SEdward Tomasz Napierala		    0
099e7e95SEdward Tomasz Napierala#endif
4b5c9cf6SEdward Tomasz Napierala		);
97824da3SAlan Cox		swapout_flags = vm_pageout_req_swapout;
4c1f8ee9SDavid Greenman		vm_pageout_req_swapout = 0;
97824da3SAlan Cox		mtx_unlock(&vm_daemon_mtx);
97824da3SAlan Cox		if (swapout_flags)
97824da3SAlan Cox			swapout_procs(swapout_flags);
97824da3SAlan Cox
2fe6e4d7SDavid Greenman		/*
0d94caffSDavid Greenman		 * scan the processes for exceeding their rlimits or if
0d94caffSDavid Greenman		 * process is swapped out -- deactivate pages
2fe6e4d7SDavid Greenman		 */
099e7e95SEdward Tomasz Napierala		tryagain = 0;
099e7e95SEdward Tomasz Napierala		attempts = 0;
099e7e95SEdward Tomasz Napieralaagain:
099e7e95SEdward Tomasz Napierala		attempts++;
1005a129SJohn Baldwin		sx_slock(&allproc_lock);
f67af5c9SXin LI		FOREACH_PROC_IN_SYSTEM(p) {
fe2144fdSLuoqi Chen			vm_pindex_t limit, size;
2fe6e4d7SDavid Greenman
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * if this is a system process or if we have already
2fe6e4d7SDavid Greenman			 * looked at this process, skip it.
2fe6e4d7SDavid Greenman			 */
897ecacdSJohn Baldwin			PROC_LOCK(p);
8e6fa660SJohn Baldwin			if (p->p_state != PRS_NORMAL ||
8e6fa660SJohn Baldwin			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
897ecacdSJohn Baldwin				PROC_UNLOCK(p);
2fe6e4d7SDavid Greenman				continue;
2fe6e4d7SDavid Greenman			}
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * if the process is in a non-running type state,
2fe6e4d7SDavid Greenman			 * don't touch it.
2fe6e4d7SDavid Greenman			 */
e602ba25SJulian Elischer			breakout = 0;
e602ba25SJulian Elischer			FOREACH_THREAD_IN_PROC(p, td) {
982d11f8SJeff Roberson				thread_lock(td);
71fad9fdSJulian Elischer				if (!TD_ON_RUNQ(td) &&
71fad9fdSJulian Elischer				    !TD_IS_RUNNING(td) &&
f497cda2SEdward Tomasz Napierala				    !TD_IS_SLEEPING(td) &&
f497cda2SEdward Tomasz Napierala				    !TD_IS_SUSPENDED(td)) {
982d11f8SJeff Roberson					thread_unlock(td);
e602ba25SJulian Elischer					breakout = 1;
e602ba25SJulian Elischer					break;
e602ba25SJulian Elischer				}
982d11f8SJeff Roberson				thread_unlock(td);
e602ba25SJulian Elischer			}
897ecacdSJohn Baldwin			if (breakout) {
897ecacdSJohn Baldwin				PROC_UNLOCK(p);
2fe6e4d7SDavid Greenman				continue;
2fe6e4d7SDavid Greenman			}
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * get a limit
2fe6e4d7SDavid Greenman			 */
f6f6d240SMateusz Guzik			lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
fe2144fdSLuoqi Chen			limit = OFF_TO_IDX(
91d5354aSJohn Baldwin			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
2fe6e4d7SDavid Greenman
2fe6e4d7SDavid Greenman			/*
0d94caffSDavid Greenman			 * let processes that are swapped out really be
0d94caffSDavid Greenman			 * swapped out set the limit to nothing (will force a
0d94caffSDavid Greenman			 * swap-out.)
2fe6e4d7SDavid Greenman			 */
b61ce5b0SJeff Roberson			if ((p->p_flag & P_INMEM) == 0)
0d94caffSDavid Greenman				limit = 0;	/* XXX */
6bed074cSKonstantin Belousov			vm = vmspace_acquire_ref(p);
897ecacdSJohn Baldwin			PROC_UNLOCK(p);
6bed074cSKonstantin Belousov			if (vm == NULL)
6bed074cSKonstantin Belousov				continue;
2fe6e4d7SDavid Greenman
6bed074cSKonstantin Belousov			size = vmspace_resident_count(vm);
a406d8c3SEdward Tomasz Napierala			if (size >= limit) {
fe2144fdSLuoqi Chen				vm_pageout_map_deactivate_pages(
6bed074cSKonstantin Belousov				    &vm->vm_map, limit);
2fe6e4d7SDavid Greenman			}
afcc55f3SEdward Tomasz Napierala#ifdef RACCT
4b5c9cf6SEdward Tomasz Napierala			if (racct_enable) {
099e7e95SEdward Tomasz Napierala				rsize = IDX_TO_OFF(size);
099e7e95SEdward Tomasz Napierala				PROC_LOCK(p);
099e7e95SEdward Tomasz Napierala				racct_set(p, RACCT_RSS, rsize);
099e7e95SEdward Tomasz Napierala				ravailable = racct_get_available(p, RACCT_RSS);
099e7e95SEdward Tomasz Napierala				PROC_UNLOCK(p);
099e7e95SEdward Tomasz Napierala				if (rsize > ravailable) {
099e7e95SEdward Tomasz Napierala					/*
4b5c9cf6SEdward Tomasz Napierala					 * Don't be overly aggressive; this
4b5c9cf6SEdward Tomasz Napierala					 * might be an innocent process,
4b5c9cf6SEdward Tomasz Napierala					 * and the limit could've been exceeded
4b5c9cf6SEdward Tomasz Napierala					 * by some memory hog.  Don't try
4b5c9cf6SEdward Tomasz Napierala					 * to deactivate more than 1/4th
4b5c9cf6SEdward Tomasz Napierala					 * of process' resident set size.
099e7e95SEdward Tomasz Napierala					 */
099e7e95SEdward Tomasz Napierala					if (attempts <= 8) {
4b5c9cf6SEdward Tomasz Napierala						if (ravailable < rsize -
4b5c9cf6SEdward Tomasz Napierala						    (rsize / 4)) {
4b5c9cf6SEdward Tomasz Napierala							ravailable = rsize -
4b5c9cf6SEdward Tomasz Napierala							    (rsize / 4);
4b5c9cf6SEdward Tomasz Napierala						}
099e7e95SEdward Tomasz Napierala					}
099e7e95SEdward Tomasz Napierala					vm_pageout_map_deactivate_pages(
4b5c9cf6SEdward Tomasz Napierala					    &vm->vm_map,
4b5c9cf6SEdward Tomasz Napierala					    OFF_TO_IDX(ravailable));
099e7e95SEdward Tomasz Napierala					/* Update RSS usage after paging out. */
099e7e95SEdward Tomasz Napierala					size = vmspace_resident_count(vm);
099e7e95SEdward Tomasz Napierala					rsize = IDX_TO_OFF(size);
099e7e95SEdward Tomasz Napierala					PROC_LOCK(p);
099e7e95SEdward Tomasz Napierala					racct_set(p, RACCT_RSS, rsize);
099e7e95SEdward Tomasz Napierala					PROC_UNLOCK(p);
099e7e95SEdward Tomasz Napierala					if (rsize > ravailable)
099e7e95SEdward Tomasz Napierala						tryagain = 1;
099e7e95SEdward Tomasz Napierala				}
4b5c9cf6SEdward Tomasz Napierala			}
afcc55f3SEdward Tomasz Napierala#endif
6bed074cSKonstantin Belousov			vmspace_free(vm);
2fe6e4d7SDavid Greenman		}
1005a129SJohn Baldwin		sx_sunlock(&allproc_lock);
099e7e95SEdward Tomasz Napierala		if (tryagain != 0 && attempts <= 10)
099e7e95SEdward Tomasz Napierala			goto again;
24a1cce3SDavid Greenman	}
2fe6e4d7SDavid Greenman}
a1287949SEivind Eklund#endif			/* !defined(NO_SWAPPING) */