sys/vm/vm_pageout.c

60727d8bSWarner Losh/*-
26f9a767SRodney W. Grimes * Copyright (c) 1991 Regents of the University of California.
26f9a767SRodney W. Grimes * All rights reserved.
26f9a767SRodney W. Grimes * Copyright (c) 1994 John S. Dyson
26f9a767SRodney W. Grimes * All rights reserved.
26f9a767SRodney W. Grimes * Copyright (c) 1994 David Greenman
26f9a767SRodney W. Grimes * All rights reserved.
8dbca793STor Egge * Copyright (c) 2005 Yahoo! Technologies Norway AS
8dbca793STor Egge * All rights reserved.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * This code is derived from software contributed to Berkeley by
df8bae1dSRodney W. Grimes * The Mach Operating System project at Carnegie-Mellon University.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Redistribution and use in source and binary forms, with or without
df8bae1dSRodney W. Grimes * modification, are permitted provided that the following conditions
df8bae1dSRodney W. Grimes * are met:
df8bae1dSRodney W. Grimes * 1. Redistributions of source code must retain the above copyright
df8bae1dSRodney W. Grimes *    notice, this list of conditions and the following disclaimer.
df8bae1dSRodney W. Grimes * 2. Redistributions in binary form must reproduce the above copyright
df8bae1dSRodney W. Grimes *    notice, this list of conditions and the following disclaimer in the
df8bae1dSRodney W. Grimes *    documentation and/or other materials provided with the distribution.
df8bae1dSRodney W. Grimes * 3. All advertising materials mentioning features or use of this software
5929bcfaSPhilippe Charnier *    must display the following acknowledgement:
df8bae1dSRodney W. Grimes *	This product includes software developed by the University of
df8bae1dSRodney W. Grimes *	California, Berkeley and its contributors.
df8bae1dSRodney W. Grimes * 4. Neither the name of the University nor the names of its contributors
df8bae1dSRodney W. Grimes *    may be used to endorse or promote products derived from this software
df8bae1dSRodney W. Grimes *    without specific prior written permission.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
df8bae1dSRodney W. Grimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
df8bae1dSRodney W. Grimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
df8bae1dSRodney W. Grimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
df8bae1dSRodney W. Grimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
df8bae1dSRodney W. Grimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
df8bae1dSRodney W. Grimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
df8bae1dSRodney W. Grimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
df8bae1dSRodney W. Grimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
df8bae1dSRodney W. Grimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
df8bae1dSRodney W. Grimes * SUCH DAMAGE.
df8bae1dSRodney W. Grimes *
3c4dd356SDavid Greenman *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Copyright (c) 1987, 1990 Carnegie-Mellon University.
df8bae1dSRodney W. Grimes * All rights reserved.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Authors: Avadis Tevanian, Jr., Michael Wayne Young
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Permission to use, copy, modify and distribute this software and
df8bae1dSRodney W. Grimes * its documentation is hereby granted, provided that both the copyright
df8bae1dSRodney W. Grimes * notice and this permission notice appear in all copies of the
df8bae1dSRodney W. Grimes * software, derivative works or modified versions, and any portions
df8bae1dSRodney W. Grimes * thereof, and that both notices appear in supporting documentation.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
df8bae1dSRodney W. Grimes * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
df8bae1dSRodney W. Grimes * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * Carnegie Mellon requests users of this software to return to
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
df8bae1dSRodney W. Grimes *  School of Computer Science
df8bae1dSRodney W. Grimes *  Carnegie Mellon University
df8bae1dSRodney W. Grimes *  Pittsburgh PA 15213-3890
df8bae1dSRodney W. Grimes *
df8bae1dSRodney W. Grimes * any improvements or extensions that they make and grant Carnegie the
df8bae1dSRodney W. Grimes * rights to redistribute these changes.
df8bae1dSRodney W. Grimes */
df8bae1dSRodney W. Grimes
df8bae1dSRodney W. Grimes/*
df8bae1dSRodney W. Grimes *	The proverbial page-out daemon.
df8bae1dSRodney W. Grimes */
df8bae1dSRodney W. Grimes
874651b1SDavid E. O'Brien#include <sys/cdefs.h>
874651b1SDavid E. O'Brien__FBSDID("$FreeBSD$");
874651b1SDavid E. O'Brien
faa5f8d8SAndrzej Bialecki#include "opt_vm.h"
df8bae1dSRodney W. Grimes#include <sys/param.h>
26f9a767SRodney W. Grimes#include <sys/systm.h>
b5e8ce9fSBruce Evans#include <sys/kernel.h>
855a310fSJeff Roberson#include <sys/eventhandler.h>
fb919e4dSMark Murray#include <sys/lock.h>
fb919e4dSMark Murray#include <sys/mutex.h>
26f9a767SRodney W. Grimes#include <sys/proc.h>
9c8b8baaSPeter Wemm#include <sys/kthread.h>
0384fff8SJason Evans#include <sys/ktr.h>
97824da3SAlan Cox#include <sys/mount.h>
099e7e95SEdward Tomasz Napierala#include <sys/racct.h>
26f9a767SRodney W. Grimes#include <sys/resourcevar.h>
b43179fbSJeff Roberson#include <sys/sched.h>
d2fc5315SPoul-Henning Kamp#include <sys/signalvar.h>
f6b04d2bSDavid Greenman#include <sys/vnode.h>
efeaf95aSDavid Greenman#include <sys/vmmeter.h>
1005a129SJohn Baldwin#include <sys/sx.h>
38efa82bSJohn Dyson#include <sys/sysctl.h>
df8bae1dSRodney W. Grimes
df8bae1dSRodney W. Grimes#include <vm/vm.h>
efeaf95aSDavid Greenman#include <vm/vm_param.h>
efeaf95aSDavid Greenman#include <vm/vm_object.h>
df8bae1dSRodney W. Grimes#include <vm/vm_page.h>
efeaf95aSDavid Greenman#include <vm/vm_map.h>
df8bae1dSRodney W. Grimes#include <vm/vm_pageout.h>
24a1cce3SDavid Greenman#include <vm/vm_pager.h>
05f0fdd2SPoul-Henning Kamp#include <vm/swap_pager.h>
efeaf95aSDavid Greenman#include <vm/vm_extern.h>
670d17b5SJeff Roberson#include <vm/uma.h>
df8bae1dSRodney W. Grimes
2b14f991SJulian Elischer/*
2b14f991SJulian Elischer * System initialization
2b14f991SJulian Elischer */
2b14f991SJulian Elischer
2b14f991SJulian Elischer/* the kernel process "vm_pageout"*/
11caded3SAlfred Perlsteinstatic void vm_pageout(void);
11caded3SAlfred Perlsteinstatic int vm_pageout_clean(vm_page_t);
11caded3SAlfred Perlsteinstatic void vm_pageout_scan(int pass);
45ae1d91SAlan Cox
2b14f991SJulian Elischerstruct proc *pageproc;
2b14f991SJulian Elischer
2b14f991SJulian Elischerstatic struct kproc_desc page_kp = {
2b14f991SJulian Elischer	"pagedaemon",
2b14f991SJulian Elischer	vm_pageout,
2b14f991SJulian Elischer	&pageproc
2b14f991SJulian Elischer};
237fdd78SRobert WatsonSYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
237fdd78SRobert Watson    &page_kp);
2b14f991SJulian Elischer
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
2b14f991SJulian Elischer/* the kernel process "vm_daemon"*/
11caded3SAlfred Perlsteinstatic void vm_daemon(void);
f708ef1bSPoul-Henning Kampstatic struct	proc *vmproc;
2b14f991SJulian Elischer
2b14f991SJulian Elischerstatic struct kproc_desc vm_kp = {
2b14f991SJulian Elischer	"vmdaemon",
2b14f991SJulian Elischer	vm_daemon,
2b14f991SJulian Elischer	&vmproc
2b14f991SJulian Elischer};
237fdd78SRobert WatsonSYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
38efa82bSJohn Dyson#endif
2b14f991SJulian Elischer
2b14f991SJulian Elischer
8b245767SAlan Coxint vm_pages_needed;		/* Event on which pageout daemon sleeps */
8b245767SAlan Coxint vm_pageout_deficit;		/* Estimated number of pages deficit */
8b245767SAlan Coxint vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
26f9a767SRodney W. Grimes
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
f708ef1bSPoul-Henning Kampstatic int vm_pageout_req_swapout;	/* XXX */
f708ef1bSPoul-Henning Kampstatic int vm_daemon_needed;
97824da3SAlan Coxstatic struct mtx vm_daemon_mtx;
97824da3SAlan Cox/* Allow for use by vm_pageout before vm_daemon is initialized. */
97824da3SAlan CoxMTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
38efa82bSJohn Dyson#endif
2b6b0df7SMatthew Dillonstatic int vm_max_launder = 32;
303b270bSEivind Eklundstatic int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
303b270bSEivind Eklundstatic int vm_pageout_full_stats_interval = 0;
26354d4cSAlan Coxstatic int vm_pageout_algorithm=0;
303b270bSEivind Eklundstatic int defer_swap_pageouts=0;
303b270bSEivind Eklundstatic int disable_swap_pageouts=0;
70111b90SJohn Dyson
38efa82bSJohn Dyson#if defined(NO_SWAPPING)
303b270bSEivind Eklundstatic int vm_swap_enabled=0;
303b270bSEivind Eklundstatic int vm_swap_idle_enabled=0;
38efa82bSJohn Dyson#else
303b270bSEivind Eklundstatic int vm_swap_enabled=1;
303b270bSEivind Eklundstatic int vm_swap_idle_enabled=0;
38efa82bSJohn Dyson#endif
38efa82bSJohn Dyson
38efa82bSJohn DysonSYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
2b6b0df7SMatthew Dillon	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
2b6b0df7SMatthew Dillon
2b6b0df7SMatthew DillonSYSCTL_INT(_vm, OID_AUTO, max_launder,
2b6b0df7SMatthew Dillon	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
38efa82bSJohn Dyson
dc2efb27SJohn DysonSYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
dc2efb27SJohn Dyson
dc2efb27SJohn DysonSYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
dc2efb27SJohn Dyson
dc2efb27SJohn DysonSYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
dc2efb27SJohn Dyson
38efa82bSJohn Dyson#if defined(NO_SWAPPING)
ceb0cf87SJohn DysonSYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
6bd9cb1cSTom Rhodes	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
6bd9cb1cSTom Rhodes	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
38efa82bSJohn Dyson#else
ceb0cf87SJohn DysonSYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
b0359e2cSPeter Wemm	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
38efa82bSJohn Dyson#endif
26f9a767SRodney W. Grimes
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
b0359e2cSPeter Wemm	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
12ac6a1dSJohn Dyson
ceb0cf87SJohn DysonSYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
b0359e2cSPeter Wemm	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
12ac6a1dSJohn Dyson
23b59018SMatthew Dillonstatic int pageout_lock_miss;
23b59018SMatthew DillonSYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
23b59018SMatthew Dillon	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
23b59018SMatthew Dillon
ffc82b0aSJohn Dyson#define VM_PAGEOUT_PAGE_COUNT 16
bbc0ec52SDavid Greenmanint vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
df8bae1dSRodney W. Grimes
c3cb3e12SDavid Greenmanint vm_page_max_wired;		/* XXX max # of wired pages system-wide */
5dfc2870SAlan CoxSYSCTL_INT(_vm, OID_AUTO, max_wired,
5dfc2870SAlan Cox	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
df8bae1dSRodney W. Grimes
/* Forward declarations of file-local helpers. */
#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon(int req);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
#endif
static void vm_pageout_page_stats(void);
cd41fc12SDavid Greenman
a8229fa3SAlan Cox/*
a8229fa3SAlan Cox * Initialize a dummy page for marking the caller's place in the specified
a8229fa3SAlan Cox * paging queue.  In principle, this function only needs to set the flag
a8229fa3SAlan Cox * PG_MARKER.  Nonetheless, it sets the flag VPO_BUSY and initializes the hold
a8229fa3SAlan Cox * count to one as safety precautions.
a8229fa3SAlan Cox */
8c616246SKonstantin Belousovstatic void
8c616246SKonstantin Belousovvm_pageout_init_marker(vm_page_t marker, u_short queue)
8c616246SKonstantin Belousov{
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	bzero(marker, sizeof(*marker));
a8229fa3SAlan Cox	marker->flags = PG_MARKER;
8c616246SKonstantin Belousov	marker->oflags = VPO_BUSY;
8c616246SKonstantin Belousov	marker->queue = queue;
a8229fa3SAlan Cox	marker->hold_count = 1;
8c616246SKonstantin Belousov}
8c616246SKonstantin Belousov
26f9a767SRodney W. Grimes/*
8dbca793STor Egge * vm_pageout_fallback_object_lock:
8dbca793STor Egge *
8dbca793STor Egge * Lock vm object currently associated with `m'. VM_OBJECT_TRYLOCK is
8dbca793STor Egge * known to have failed and page queue must be either PQ_ACTIVE or
8dbca793STor Egge * PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
8dbca793STor Egge * while locking the vm object.  Use marker page to detect page queue
8dbca793STor Egge * changes and maintain notion of next page on page queue.  Return
8dbca793STor Egge * TRUE if no changes were detected, FALSE otherwise.  vm object is
8dbca793STor Egge * locked on return.
8dbca793STor Egge *
8dbca793STor Egge * This function depends on both the lock portion of struct vm_object
8dbca793STor Egge * and normal struct vm_page being type stable.
8dbca793STor Egge */
da31e3aaSAlan Coxboolean_t
8dbca793STor Eggevm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
8dbca793STor Egge{
8dbca793STor Egge	struct vm_page marker;
8dbca793STor Egge	boolean_t unchanged;
8dbca793STor Egge	u_short queue;
8dbca793STor Egge	vm_object_t object;
8dbca793STor Egge
8dbca793STor Egge	queue = m->queue;
8c616246SKonstantin Belousov	vm_pageout_init_marker(&marker, queue);
8dbca793STor Egge	object = m->object;
8dbca793STor Egge
8dbca793STor Egge	TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl,
8dbca793STor Egge			   m, &marker, pageq);
8dbca793STor Egge	vm_page_unlock_queues();
2965a453SKip Macy	vm_page_unlock(m);
8dbca793STor Egge	VM_OBJECT_LOCK(object);
2965a453SKip Macy	vm_page_lock(m);
8dbca793STor Egge	vm_page_lock_queues();
8dbca793STor Egge
8dbca793STor Egge	/* Page queue might have changed. */
8dbca793STor Egge	*next = TAILQ_NEXT(&marker, pageq);
8dbca793STor Egge	unchanged = (m->queue == queue &&
8dbca793STor Egge		     m->object == object &&
8dbca793STor Egge		     &marker == TAILQ_NEXT(m, pageq));
8dbca793STor Egge	TAILQ_REMOVE(&vm_page_queues[queue].pl,
8dbca793STor Egge		     &marker, pageq);
8dbca793STor Egge	return (unchanged);
8dbca793STor Egge}
8dbca793STor Egge
8dbca793STor Egge/*
8c616246SKonstantin Belousov * Lock the page while holding the page queue lock.  Use marker page
8c616246SKonstantin Belousov * to detect page queue changes and maintain notion of next page on
8c616246SKonstantin Belousov * page queue.  Return TRUE if no changes were detected, FALSE
8c616246SKonstantin Belousov * otherwise.  The page is locked on return. The page queue lock might
8c616246SKonstantin Belousov * be dropped and reacquired.
8c616246SKonstantin Belousov *
8c616246SKonstantin Belousov * This function depends on normal struct vm_page being type stable.
8c616246SKonstantin Belousov */
8c616246SKonstantin Belousovboolean_t
8c616246SKonstantin Belousovvm_pageout_page_lock(vm_page_t m, vm_page_t *next)
8c616246SKonstantin Belousov{
8c616246SKonstantin Belousov	struct vm_page marker;
8c616246SKonstantin Belousov	boolean_t unchanged;
8c616246SKonstantin Belousov	u_short queue;
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	vm_page_lock_assert(m, MA_NOTOWNED);
8c616246SKonstantin Belousov	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	if (vm_page_trylock(m))
8c616246SKonstantin Belousov		return (TRUE);
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	queue = m->queue;
8c616246SKonstantin Belousov	vm_pageout_init_marker(&marker, queue);
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl, m, &marker, pageq);
8c616246SKonstantin Belousov	vm_page_unlock_queues();
8c616246SKonstantin Belousov	vm_page_lock(m);
8c616246SKonstantin Belousov	vm_page_lock_queues();
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov	/* Page queue might have changed. */
8c616246SKonstantin Belousov	*next = TAILQ_NEXT(&marker, pageq);
8c616246SKonstantin Belousov	unchanged = (m->queue == queue && &marker == TAILQ_NEXT(m, pageq));
8c616246SKonstantin Belousov	TAILQ_REMOVE(&vm_page_queues[queue].pl, &marker, pageq);
8c616246SKonstantin Belousov	return (unchanged);
8c616246SKonstantin Belousov}
8c616246SKonstantin Belousov
8c616246SKonstantin Belousov/*
26f9a767SRodney W. Grimes * vm_pageout_clean:
24a1cce3SDavid Greenman *
0d94caffSDavid Greenman * Clean the page and remove it from the laundry.
26f9a767SRodney W. Grimes *
0d94caffSDavid Greenman * We set the busy bit to cause potential page faults on this page to
1c7c3c6aSMatthew Dillon * block.  Note the careful timing, however, the busy bit isn't set till
1c7c3c6aSMatthew Dillon * late and we cannot do anything that will mess with the page.
26f9a767SRodney W. Grimes */
3af76890SPoul-Henning Kampstatic int
2965a453SKip Macyvm_pageout_clean(vm_page_t m)
24a1cce3SDavid Greenman{
54d92145SMatthew Dillon	vm_object_t object;
91b4f427SAlan Cox	vm_page_t mc[2*vm_pageout_page_count], pb, ps;
3562af12SAlan Cox	int pageout_count;
90ecac61SMatthew Dillon	int ib, is, page_base;
a316d390SJohn Dyson	vm_pindex_t pindex = m->pindex;
26f9a767SRodney W. Grimes
95976f3fSAlan Cox	vm_page_lock_assert(m, MA_OWNED);
17f6a17bSAlan Cox	object = m->object;
17f6a17bSAlan Cox	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
0cddd8f0SMatthew Dillon
26f9a767SRodney W. Grimes	/*
1c7c3c6aSMatthew Dillon	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
1c7c3c6aSMatthew Dillon	 * with the new swapper, but we could have serious problems paging
1c7c3c6aSMatthew Dillon	 * out other object types if there is insufficient memory.
1c7c3c6aSMatthew Dillon	 *
1c7c3c6aSMatthew Dillon	 * Unfortunately, checking free memory here is far too late, so the
1c7c3c6aSMatthew Dillon	 * check has been moved up a procedural level.
1c7c3c6aSMatthew Dillon	 */
1c7c3c6aSMatthew Dillon
24a1cce3SDavid Greenman	/*
9e897b1bSAlan Cox	 * Can't clean the page if it's busy or held.
24a1cce3SDavid Greenman	 */
95976f3fSAlan Cox	KASSERT(m->busy == 0 && (m->oflags & VPO_BUSY) == 0,
95976f3fSAlan Cox	    ("vm_pageout_clean: page %p is busy", m));
95976f3fSAlan Cox	KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
17f6a17bSAlan Cox	vm_page_unlock(m);
0d94caffSDavid Greenman
91b4f427SAlan Cox	mc[vm_pageout_page_count] = pb = ps = m;
26f9a767SRodney W. Grimes	pageout_count = 1;
f35329acSJohn Dyson	page_base = vm_pageout_page_count;
90ecac61SMatthew Dillon	ib = 1;
90ecac61SMatthew Dillon	is = 1;
90ecac61SMatthew Dillon
24a1cce3SDavid Greenman	/*
24a1cce3SDavid Greenman	 * Scan object for clusterable pages.
24a1cce3SDavid Greenman	 *
24a1cce3SDavid Greenman	 * We can cluster ONLY if: ->> the page is NOT
24a1cce3SDavid Greenman	 * clean, wired, busy, held, or mapped into a
24a1cce3SDavid Greenman	 * buffer, and one of the following:
24a1cce3SDavid Greenman	 * 1) The page is inactive, or a seldom used
24a1cce3SDavid Greenman	 *    active page.
24a1cce3SDavid Greenman	 * -or-
24a1cce3SDavid Greenman	 * 2) we force the issue.
90ecac61SMatthew Dillon	 *
90ecac61SMatthew Dillon	 * During heavy mmap/modification loads the pageout
90ecac61SMatthew Dillon	 * daemon can really fragment the underlying file
90ecac61SMatthew Dillon	 * due to flushing pages out of order and not trying
90ecac61SMatthew Dillon	 * align the clusters (which leave sporatic out-of-order
90ecac61SMatthew Dillon	 * holes).  To solve this problem we do the reverse scan
90ecac61SMatthew Dillon	 * first and attempt to align our cluster, then do a
90ecac61SMatthew Dillon	 * forward scan if room remains.
24a1cce3SDavid Greenman	 */
90ecac61SMatthew Dillonmore:
90ecac61SMatthew Dillon	while (ib && pageout_count < vm_pageout_page_count) {
24a1cce3SDavid Greenman		vm_page_t p;
f6b04d2bSDavid Greenman
90ecac61SMatthew Dillon		if (ib > pindex) {
90ecac61SMatthew Dillon			ib = 0;
90ecac61SMatthew Dillon			break;
f6b04d2bSDavid Greenman		}
90ecac61SMatthew Dillon
91b4f427SAlan Cox		if ((p = vm_page_prev(pb)) == NULL ||
91b4f427SAlan Cox		    (p->oflags & VPO_BUSY) != 0 || p->busy != 0) {
90ecac61SMatthew Dillon			ib = 0;
90ecac61SMatthew Dillon			break;
f6b04d2bSDavid Greenman		}
2965a453SKip Macy		vm_page_lock(p);
24a1cce3SDavid Greenman		vm_page_test_dirty(p);
26f4eea5SAlan Cox		if (p->dirty == 0 ||
90ecac61SMatthew Dillon		    p->queue != PQ_INACTIVE ||
57601bcbSMatthew Dillon		    p->hold_count != 0) {	/* may be undergoing I/O */
2965a453SKip Macy			vm_page_unlock(p);
90ecac61SMatthew Dillon			ib = 0;
24a1cce3SDavid Greenman			break;
f6b04d2bSDavid Greenman		}
2965a453SKip Macy		vm_page_unlock(p);
91b4f427SAlan Cox		mc[--page_base] = pb = p;
90ecac61SMatthew Dillon		++pageout_count;
90ecac61SMatthew Dillon		++ib;
24a1cce3SDavid Greenman		/*
90ecac61SMatthew Dillon		 * alignment boundry, stop here and switch directions.  Do
90ecac61SMatthew Dillon		 * not clear ib.
24a1cce3SDavid Greenman		 */
90ecac61SMatthew Dillon		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
90ecac61SMatthew Dillon			break;
24a1cce3SDavid Greenman	}
90ecac61SMatthew Dillon
90ecac61SMatthew Dillon	while (pageout_count < vm_pageout_page_count &&
90ecac61SMatthew Dillon	    pindex + is < object->size) {
90ecac61SMatthew Dillon		vm_page_t p;
90ecac61SMatthew Dillon
91b4f427SAlan Cox		if ((p = vm_page_next(ps)) == NULL ||
91b4f427SAlan Cox		    (p->oflags & VPO_BUSY) != 0 || p->busy != 0)
90ecac61SMatthew Dillon			break;
2965a453SKip Macy		vm_page_lock(p);
24a1cce3SDavid Greenman		vm_page_test_dirty(p);
26f4eea5SAlan Cox		if (p->dirty == 0 ||
90ecac61SMatthew Dillon		    p->queue != PQ_INACTIVE ||
57601bcbSMatthew Dillon		    p->hold_count != 0) {	/* may be undergoing I/O */
2965a453SKip Macy			vm_page_unlock(p);
24a1cce3SDavid Greenman			break;
24a1cce3SDavid Greenman		}
2965a453SKip Macy		vm_page_unlock(p);
91b4f427SAlan Cox		mc[page_base + pageout_count] = ps = p;
90ecac61SMatthew Dillon		++pageout_count;
90ecac61SMatthew Dillon		++is;
24a1cce3SDavid Greenman	}
90ecac61SMatthew Dillon
90ecac61SMatthew Dillon	/*
90ecac61SMatthew Dillon	 * If we exhausted our forward scan, continue with the reverse scan
90ecac61SMatthew Dillon	 * when possible, even past a page boundry.  This catches boundry
90ecac61SMatthew Dillon	 * conditions.
90ecac61SMatthew Dillon	 */
90ecac61SMatthew Dillon	if (ib && pageout_count < vm_pageout_page_count)
90ecac61SMatthew Dillon		goto more;
f6b04d2bSDavid Greenman
67bf6868SJohn Dyson	/*
67bf6868SJohn Dyson	 * we allow reads during pageouts...
67bf6868SJohn Dyson	 */
126d6082SKonstantin Belousov	return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
126d6082SKonstantin Belousov	    NULL));
aef922f5SJohn Dyson}
aef922f5SJohn Dyson
1c7c3c6aSMatthew Dillon/*
1c7c3c6aSMatthew Dillon * vm_pageout_flush() - launder the given pages
1c7c3c6aSMatthew Dillon *
1c7c3c6aSMatthew Dillon *	The given pages are laundered.  Note that we setup for the start of
1c7c3c6aSMatthew Dillon *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
1c7c3c6aSMatthew Dillon *	reference count all in here rather then in the parent.  If we want
1c7c3c6aSMatthew Dillon *	the parent to do more sophisticated things we may have to change
1c7c3c6aSMatthew Dillon *	the ordering.
1e8a675cSKonstantin Belousov *
1e8a675cSKonstantin Belousov *	Returned runlen is the count of pages between mreq and first
1e8a675cSKonstantin Belousov *	page after mreq with status VM_PAGER_AGAIN.
126d6082SKonstantin Belousov *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
126d6082SKonstantin Belousov *	for any page in runlen set.
1c7c3c6aSMatthew Dillon */
aef922f5SJohn Dysonint
126d6082SKonstantin Belousovvm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
126d6082SKonstantin Belousov    boolean_t *eio)
aef922f5SJohn Dyson{
2e3b314dSAlan Cox	vm_object_t object = mc[0]->object;
aef922f5SJohn Dyson	int pageout_status[count];
95461b45SJohn Dyson	int numpagedout = 0;
1e8a675cSKonstantin Belousov	int i, runlen;
aef922f5SJohn Dyson
2e3b314dSAlan Cox	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
7bec141bSKip Macy	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
7bec141bSKip Macy
1c7c3c6aSMatthew Dillon	/*
1c7c3c6aSMatthew Dillon	 * Initiate I/O.  Bump the vm_page_t->busy counter and
1c7c3c6aSMatthew Dillon	 * mark the pages read-only.
1c7c3c6aSMatthew Dillon	 *
1c7c3c6aSMatthew Dillon	 * We do not have to fixup the clean/dirty bits here... we can
1c7c3c6aSMatthew Dillon	 * allow the pager to do it after the I/O completes.
02fa91d3SMatthew Dillon	 *
02fa91d3SMatthew Dillon	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
02fa91d3SMatthew Dillon	 * edge case with file fragments.
1c7c3c6aSMatthew Dillon	 */
8f9110f6SJohn Dyson	for (i = 0; i < count; i++) {
7a935082SAlan Cox		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
7a935082SAlan Cox		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
7a935082SAlan Cox			mc[i], i, count));
e69763a3SDoug Rabson		vm_page_io_start(mc[i]);
78985e42SAlan Cox		pmap_remove_write(mc[i]);
2965a453SKip Macy	}
d474eaaaSDoug Rabson	vm_object_pip_add(object, count);
aef922f5SJohn Dyson
d076fbeaSAlan Cox	vm_pager_put_pages(object, mc, count, flags, pageout_status);
26f9a767SRodney W. Grimes
1e8a675cSKonstantin Belousov	runlen = count - mreq;
126d6082SKonstantin Belousov	if (eio != NULL)
126d6082SKonstantin Belousov		*eio = FALSE;
aef922f5SJohn Dyson	for (i = 0; i < count; i++) {
aef922f5SJohn Dyson		vm_page_t mt = mc[i];
24a1cce3SDavid Greenman
4cd45723SAlan Cox		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
*6031c68dSAlan Cox		    !pmap_page_is_write_mapped(mt),
9ea8d1a6SAlan Cox		    ("vm_pageout_flush: page %p is not write protected", mt));
26f9a767SRodney W. Grimes		switch (pageout_status[i]) {
26f9a767SRodney W. Grimes		case VM_PAGER_OK:
26f9a767SRodney W. Grimes		case VM_PAGER_PEND:
95461b45SJohn Dyson			numpagedout++;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_BAD:
26f9a767SRodney W. Grimes			/*
0d94caffSDavid Greenman			 * Page outside of range of object. Right now we
0d94caffSDavid Greenman			 * essentially lose the changes by pretending it
0d94caffSDavid Greenman			 * worked.
26f9a767SRodney W. Grimes			 */
90ecac61SMatthew Dillon			vm_page_undirty(mt);
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_ERROR:
26f9a767SRodney W. Grimes		case VM_PAGER_FAIL:
26f9a767SRodney W. Grimes			/*
0d94caffSDavid Greenman			 * If page couldn't be paged out, then reactivate the
0d94caffSDavid Greenman			 * page so it doesn't clog the inactive list.  (We
0d94caffSDavid Greenman			 * will try paging out it again later).
26f9a767SRodney W. Grimes			 */
3c4a2440SAlan Cox			vm_page_lock(mt);
24a1cce3SDavid Greenman			vm_page_activate(mt);
3c4a2440SAlan Cox			vm_page_unlock(mt);
126d6082SKonstantin Belousov			if (eio != NULL && i >= mreq && i - mreq < runlen)
126d6082SKonstantin Belousov				*eio = TRUE;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		case VM_PAGER_AGAIN:
1e8a675cSKonstantin Belousov			if (i >= mreq && i - mreq < runlen)
1e8a675cSKonstantin Belousov				runlen = i - mreq;
26f9a767SRodney W. Grimes			break;
26f9a767SRodney W. Grimes		}
26f9a767SRodney W. Grimes
26f9a767SRodney W. Grimes		/*
0d94caffSDavid Greenman		 * If the operation is still going, leave the page busy to
0d94caffSDavid Greenman		 * block all other accesses. Also, leave the paging in
0d94caffSDavid Greenman		 * progress indicator set so that we don't attempt an object
0d94caffSDavid Greenman		 * collapse.
26f9a767SRodney W. Grimes		 */
26f9a767SRodney W. Grimes		if (pageout_status[i] != VM_PAGER_PEND) {
f919ebdeSDavid Greenman			vm_object_pip_wakeup(object);
e69763a3SDoug Rabson			vm_page_io_finish(mt);
3c4a2440SAlan Cox			if (vm_page_count_severe()) {
3c4a2440SAlan Cox				vm_page_lock(mt);
9ea8d1a6SAlan Cox				vm_page_try_to_cache(mt);
2965a453SKip Macy				vm_page_unlock(mt);
26f9a767SRodney W. Grimes			}
3c4a2440SAlan Cox		}
3c4a2440SAlan Cox	}
1e8a675cSKonstantin Belousov	if (prunlen != NULL)
1e8a675cSKonstantin Belousov		*prunlen = runlen;
3c4a2440SAlan Cox	return (numpagedout);
26f9a767SRodney W. Grimes}
26f9a767SRodney W. Grimes
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
26f9a767SRodney W. Grimes/*
26f9a767SRodney W. Grimes *	vm_pageout_object_deactivate_pages
26f9a767SRodney W. Grimes *
ce186587SAlan Cox *	Deactivate enough pages to satisfy the inactive target
ce186587SAlan Cox *	requirements.
26f9a767SRodney W. Grimes *
26f9a767SRodney W. Grimes *	The object and map must be locked.
26f9a767SRodney W. Grimes */
38efa82bSJohn Dysonstatic void
ce186587SAlan Coxvm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
ce186587SAlan Cox    long desired)
26f9a767SRodney W. Grimes{
ecf6279fSAlan Cox	vm_object_t backing_object, object;
ce186587SAlan Cox	vm_page_t p;
82bfb965SAlan Cox	int actcount, remove_mode;
26f9a767SRodney W. Grimes
ecf6279fSAlan Cox	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
01381811SJohn Baldwin	if (first_object->type == OBJT_DEVICE ||
82bfb965SAlan Cox	    first_object->type == OBJT_SG)
38efa82bSJohn Dyson		return;
ecf6279fSAlan Cox	for (object = first_object;; object = backing_object) {
ecf6279fSAlan Cox		if (pmap_resident_count(pmap) <= desired)
ecf6279fSAlan Cox			goto unlock_return;
447fe2a4SAlan Cox		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
82bfb965SAlan Cox		if (object->type == OBJT_PHYS || object->paging_in_progress)
ecf6279fSAlan Cox			goto unlock_return;
26f9a767SRodney W. Grimes
85b1dc89SAlan Cox		remove_mode = 0;
38efa82bSJohn Dyson		if (object->shadow_count > 1)
38efa82bSJohn Dyson			remove_mode = 1;
26f9a767SRodney W. Grimes		/*
ce186587SAlan Cox		 * Scan the object's entire memory queue.
26f9a767SRodney W. Grimes		 */
ce186587SAlan Cox		TAILQ_FOREACH(p, &object->memq, listq) {
447fe2a4SAlan Cox			if (pmap_resident_count(pmap) <= desired)
447fe2a4SAlan Cox				goto unlock_return;
ce186587SAlan Cox			if ((p->oflags & VPO_BUSY) != 0 || p->busy != 0)
447fe2a4SAlan Cox				continue;
ce186587SAlan Cox			PCPU_INC(cnt.v_pdpages);
2965a453SKip Macy			vm_page_lock(p);
ce186587SAlan Cox			if (p->wire_count != 0 || p->hold_count != 0 ||
ecf6279fSAlan Cox			    !pmap_page_exists_quick(pmap, p)) {
2965a453SKip Macy				vm_page_unlock(p);
0d94caffSDavid Greenman				continue;
0d94caffSDavid Greenman			}
0385347cSPeter Wemm			actcount = pmap_ts_referenced(p);
3407fefeSKonstantin Belousov			if ((p->aflags & PGA_REFERENCED) != 0) {
ce186587SAlan Cox				if (actcount == 0)
7e006499SJohn Dyson					actcount = 1;
3407fefeSKonstantin Belousov				vm_page_aflag_clear(p, PGA_REFERENCED);
ef743ce6SJohn Dyson			}
ce186587SAlan Cox			if (p->queue != PQ_ACTIVE && actcount != 0) {
ef743ce6SJohn Dyson				vm_page_activate(p);
7e006499SJohn Dyson				p->act_count += actcount;
c8c4b40cSJohn Dyson			} else if (p->queue == PQ_ACTIVE) {
ce186587SAlan Cox				if (actcount == 0) {
ce186587SAlan Cox					p->act_count -= min(p->act_count,
ce186587SAlan Cox					    ACT_DECLINE);
ce186587SAlan Cox					if (!remove_mode &&
ce186587SAlan Cox					    (vm_pageout_algorithm ||
ce186587SAlan Cox					    p->act_count == 0)) {
4fec79beSAlan Cox						pmap_remove_all(p);
26f9a767SRodney W. Grimes						vm_page_deactivate(p);
26f9a767SRodney W. Grimes					} else {
ce186587SAlan Cox						vm_page_lock_queues();
e5b006ffSAlan Cox						vm_page_requeue(p);
ce186587SAlan Cox						vm_page_unlock_queues();
c8c4b40cSJohn Dyson					}
c8c4b40cSJohn Dyson				} else {
eaf13dd7SJohn Dyson					vm_page_activate(p);
ce186587SAlan Cox					if (p->act_count < ACT_MAX -
ce186587SAlan Cox					    ACT_ADVANCE)
38efa82bSJohn Dyson						p->act_count += ACT_ADVANCE;
ce186587SAlan Cox					vm_page_lock_queues();
e5b006ffSAlan Cox					vm_page_requeue(p);
2965a453SKip Macy					vm_page_unlock_queues();
ce186587SAlan Cox				}
ce186587SAlan Cox			} else if (p->queue == PQ_INACTIVE)
ce186587SAlan Cox				pmap_remove_all(p);
2965a453SKip Macy			vm_page_unlock(p);
26f9a767SRodney W. Grimes		}
ecf6279fSAlan Cox		if ((backing_object = object->backing_object) == NULL)
ecf6279fSAlan Cox			goto unlock_return;
ecf6279fSAlan Cox		VM_OBJECT_LOCK(backing_object);
ecf6279fSAlan Cox		if (object != first_object)
ecf6279fSAlan Cox			VM_OBJECT_UNLOCK(object);
38efa82bSJohn Dyson	}
ecf6279fSAlan Coxunlock_return:
ecf6279fSAlan Cox	if (object != first_object)
ecf6279fSAlan Cox		VM_OBJECT_UNLOCK(object);
26f9a767SRodney W. Grimes}
26f9a767SRodney W. Grimes
26f9a767SRodney W. Grimes/*
26f9a767SRodney W. Grimes * deactivate some number of pages in a map, try to do it fairly, but
26f9a767SRodney W. Grimes * that is really hard to do.
 *
 * map:     address map whose pages are candidates for deactivation.
 * desired: target resident page count for the map's pmap; scanning stops
 *          once pmap_resident_count() is at or below this value.
 *
 * The map is only try-locked: if the lock is not immediately available
 * the function returns without doing any work.  Otherwise it
 *   1. walks every entry to pick the eligible object with the most
 *      resident pages and to note whether any entry is wired,
 *   2. deactivates pages from that object first,
 *   3. walks the entries again, deactivating pages object by object until
 *      the resident count reaches "desired", and
 *   4. if desired == 0 and nothing is wired, removes every mapping in the
 *      map's address range from the pmap.
26f9a767SRodney W. Grimes */
cd41fc12SDavid Greenmanstatic void
38efa82bSJohn Dysonvm_pageout_map_deactivate_pages(map, desired)
26f9a767SRodney W. Grimes	vm_map_t map;
ecf6279fSAlan Cox	long desired;
26f9a767SRodney W. Grimes{
26f9a767SRodney W. Grimes	vm_map_entry_t tmpe;
38efa82bSJohn Dyson	vm_object_t obj, bigobj;
30105b9eSTor Egge	int nothingwired;
0d94caffSDavid Greenman
	/* Never block on the map lock; simply give up if it is contended. */
d974f03cSAlan Cox	if (!vm_map_trylock(map))
26f9a767SRodney W. Grimes		return;
38efa82bSJohn Dyson
38efa82bSJohn Dyson	bigobj = NULL;
30105b9eSTor Egge	nothingwired = TRUE;
38efa82bSJohn Dyson
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * first, search out the biggest object, and try to free pages from
38efa82bSJohn Dyson	 * that.
	 *
	 * An object is eligible only if its entry is not a submap, its lock
	 * can be try-acquired, and its shadow_count is at most 1.  The
	 * current best candidate is kept locked; every other object is
	 * unlocked again before moving on.
38efa82bSJohn Dyson	 */
26f9a767SRodney W. Grimes	tmpe = map->header.next;
38efa82bSJohn Dyson	while (tmpe != &map->header) {
9fdfe602SMatthew Dillon		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
38efa82bSJohn Dyson			obj = tmpe->object.vm_object;
0774dfb3SAlan Cox			if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) {
0774dfb3SAlan Cox				if (obj->shadow_count <= 1 &&
0774dfb3SAlan Cox				    (bigobj == NULL ||
0774dfb3SAlan Cox				     bigobj->resident_page_count < obj->resident_page_count)) {
					/*
					 * New best candidate: release the
					 * previous one and keep obj locked.
					 */
0774dfb3SAlan Cox					if (bigobj != NULL)
0774dfb3SAlan Cox						VM_OBJECT_UNLOCK(bigobj);
38efa82bSJohn Dyson					bigobj = obj;
0774dfb3SAlan Cox				} else
0774dfb3SAlan Cox					VM_OBJECT_UNLOCK(obj);
38efa82bSJohn Dyson			}
38efa82bSJohn Dyson		}
		/* Wiring is checked for every entry, submaps included. */
30105b9eSTor Egge		if (tmpe->wired_count > 0)
30105b9eSTor Egge			nothingwired = FALSE;
38efa82bSJohn Dyson		tmpe = tmpe->next;
38efa82bSJohn Dyson	}
38efa82bSJohn Dyson
	/* bigobj, if any, is still locked from the search above. */
0774dfb3SAlan Cox	if (bigobj != NULL) {
ecf6279fSAlan Cox		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
0774dfb3SAlan Cox		VM_OBJECT_UNLOCK(bigobj);
0774dfb3SAlan Cox	}
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * Next, hunt around for other pages to deactivate.  We actually
38efa82bSJohn Dyson	 * do this search sort of wrong -- .text first is not the best idea.
	 *
	 * Unlike the first pass, this pass takes each object lock with a
	 * blocking lock rather than a try-lock, and it does not skip the
	 * object that was already processed above.
38efa82bSJohn Dyson	 */
38efa82bSJohn Dyson	tmpe = map->header.next;
38efa82bSJohn Dyson	while (tmpe != &map->header) {
b1028ad1SLuoqi Chen		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
38efa82bSJohn Dyson			break;
9fdfe602SMatthew Dillon		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
38efa82bSJohn Dyson			obj = tmpe->object.vm_object;
0774dfb3SAlan Cox			if (obj != NULL) {
0774dfb3SAlan Cox				VM_OBJECT_LOCK(obj);
ecf6279fSAlan Cox				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
0774dfb3SAlan Cox				VM_OBJECT_UNLOCK(obj);
0774dfb3SAlan Cox			}
38efa82bSJohn Dyson		}
26f9a767SRodney W. Grimes		tmpe = tmpe->next;
38857e7fSAlan Cox	}
38efa82bSJohn Dyson
38efa82bSJohn Dyson	/*
38efa82bSJohn Dyson	 * Remove all mappings if a process is swapped out, this will free page
38efa82bSJohn Dyson	 * table pages.
38efa82bSJohn Dyson	 */
38857e7fSAlan Cox	if (desired == 0 && nothingwired) {
8d01a3b2SNathan Whitehorn		pmap_remove(vm_map_pmap(map), vm_map_min(map),
8d01a3b2SNathan Whitehorn		    vm_map_max(map));
38857e7fSAlan Cox	}
38efa82bSJohn Dyson	vm_map_unlock(map);
26f9a767SRodney W. Grimes}
a1287949SEivind Eklund#endif		/* !defined(NO_SWAPPING) */
df8bae1dSRodney W. Grimes
1c7c3c6aSMatthew Dillon/*
df8bae1dSRodney W. Grimes *	vm_pageout_scan does the dirty work for the pageout daemon.
 *
 *	pass:	zero on the first pass.  A non-zero value raises the
 *		laundering limit to 10000 pages, disables the extra trip
 *		through the inactive queue that dirty pages otherwise get
 *		(PG_WINATCFLS), and allows vm_pageout_oom() to be called.
 *
 *	The work is done in stages:
 *	  1. Invoke the vm_lowmem event handlers and uma_reclaim().
 *	  2. Scan the inactive queue: free invalid pages, cache clean
 *	     pages, reactivate referenced pages and launder dirty pages.
 *	  3. Scan the active queue: requeue pages that were referenced and
 *	     deactivate (or cache) pages whose object is unreferenced,
 *	     whose act_count has decayed to zero, or unconditionally when
 *	     vm_pageout_algorithm is set.
 *	  4. Request swapout, speed up the syncer, or call the OOM handler
 *	     if the paging target is still not met.
df8bae1dSRodney W. Grimes */
2b6b0df7SMatthew Dillonstatic void
2b6b0df7SMatthew Dillonvm_pageout_scan(int pass)
df8bae1dSRodney W. Grimes{
502ba6e4SJohn Dyson	vm_page_t m, next;
936524aaSMatthew Dillon	struct vm_page marker;
1c7c3c6aSMatthew Dillon	int page_shortage, maxscan, pcount;
1c7c3c6aSMatthew Dillon	int addl_page_shortage, addl_page_shortage_init;
df8bae1dSRodney W. Grimes	vm_object_t object;
2446e4f0SAlan Cox	int actcount;
f6b04d2bSDavid Greenman	int vnodes_skipped = 0;
2b6b0df7SMatthew Dillon	int maxlaunder;
0d94caffSDavid Greenman
df8bae1dSRodney W. Grimes	/*
855a310fSJeff Roberson	 * Decrease registered cache sizes.
855a310fSJeff Roberson	 */
855a310fSJeff Roberson	EVENTHANDLER_INVOKE(vm_lowmem, 0);
855a310fSJeff Roberson	/*
855a310fSJeff Roberson	 * We do this explicitly after the caches have been drained above.
855a310fSJeff Roberson	 */
855a310fSJeff Roberson	uma_reclaim();
5985940eSJohn Dyson
	/* Take over (and reset) the deficit accumulated since the last scan. */
b0ef8c5fSAlan Cox	addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit);
b182ec9eSJohn Dyson
1c7c3c6aSMatthew Dillon	/*
1c7c3c6aSMatthew Dillon	 * Calculate the number of pages we want to either free or move
2b6b0df7SMatthew Dillon	 * to the cache.
1c7c3c6aSMatthew Dillon	 */
2b6b0df7SMatthew Dillon	page_shortage = vm_paging_target() + addl_page_shortage_init;
1c7c3c6aSMatthew Dillon
af394cfaSJung-uk Kim	vm_pageout_init_marker(&marker, PQ_INACTIVE);
936524aaSMatthew Dillon
936524aaSMatthew Dillon	/*
1c7c3c6aSMatthew Dillon	 * Start scanning the inactive queue for pages we can move to the
1c7c3c6aSMatthew Dillon	 * cache or free.  The scan will stop when the target is reached or
936524aaSMatthew Dillon	 * we have scanned the entire inactive queue.  Note that m->act_count
936524aaSMatthew Dillon	 * is not used to form decisions for the inactive queue, only for the
936524aaSMatthew Dillon	 * active queue.
2b6b0df7SMatthew Dillon	 *
2b6b0df7SMatthew Dillon	 * maxlaunder limits the number of dirty pages we flush per scan.
2b6b0df7SMatthew Dillon	 * For most systems a smaller value (16 or 32) is more robust under
2b6b0df7SMatthew Dillon	 * extreme memory and disk pressure because any unnecessary writes
2b6b0df7SMatthew Dillon	 * to disk can result in extreme performance degradation.  However,
2b6b0df7SMatthew Dillon	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
2b6b0df7SMatthew Dillon	 * used) will die horribly with limited laundering.  If the pageout
2b6b0df7SMatthew Dillon	 * daemon cannot clean enough pages in the first pass, we let it go
2b6b0df7SMatthew Dillon	 * all out in succeeding passes.
1c7c3c6aSMatthew Dillon	 */
2b6b0df7SMatthew Dillon	if ((maxlaunder = vm_max_launder) <= 1)
2b6b0df7SMatthew Dillon		maxlaunder = 1;
2b6b0df7SMatthew Dillon	if (pass)
2b6b0df7SMatthew Dillon		maxlaunder = 10000;
3e1b578aSAlan Cox	vm_page_lock_queues();
	/*
	 * Restart point for the inactive scan.  Only addl_page_shortage and
	 * maxscan are reset here; page_shortage and maxlaunder keep whatever
	 * progress has already been made.
	 */
67bf6868SJohn Dysonrescan0:
1c7c3c6aSMatthew Dillon	addl_page_shortage = addl_page_shortage_init;
2feb50bfSAttilio Rao	maxscan = cnt.v_inactive_count;
6d03d577SMatthew Dillon
be72f788SAlan Cox	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
1c7c3c6aSMatthew Dillon	     m != NULL && maxscan-- > 0 && page_shortage > 0;
e929c00dSKirk McKusick	     m = next) {
df8bae1dSRodney W. Grimes
		/* Counted for every element visited, marker pages included. */
393a081dSAttilio Rao		cnt.v_pdpages++;
b182ec9eSJohn Dyson
		/*
		 * The page is no longer on the inactive queue, so start over
		 * from the head of the queue.
		 */
9cf51988SAlan Cox		if (m->queue != PQ_INACTIVE)
67bf6868SJohn Dyson			goto rescan0;
b182ec9eSJohn Dyson
b18bfc3dSJohn Dyson		next = TAILQ_NEXT(m, pageq);
df8bae1dSRodney W. Grimes
936524aaSMatthew Dillon		/*
936524aaSMatthew Dillon		 * skip marker pages
936524aaSMatthew Dillon		 */
936524aaSMatthew Dillon		if (m->flags & PG_MARKER)
936524aaSMatthew Dillon			continue;
936524aaSMatthew Dillon
7900f95dSKonstantin Belousov		KASSERT((m->flags & PG_FICTITIOUS) == 0,
7900f95dSKonstantin Belousov		    ("Fictitious page %p cannot be in inactive queue", m));
7900f95dSKonstantin Belousov		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7900f95dSKonstantin Belousov		    ("Unmanaged page %p cannot be in inactive queue", m));
7900f95dSKonstantin Belousov
8c616246SKonstantin Belousov		/*
8c616246SKonstantin Belousov		 * Lock the page.
		 *
		 * NOTE(review): the failure path below unlocks the page, so
		 * vm_pageout_page_lock() evidently returns with the page
		 * lock held even when it returns false, and it may update
		 * "next" -- confirm against the helper's definition.
8c616246SKonstantin Belousov		 */
8c616246SKonstantin Belousov		if (!vm_pageout_page_lock(m, &next)) {
8c616246SKonstantin Belousov			vm_page_unlock(m);
b182ec9eSJohn Dyson			addl_page_shortage++;
b182ec9eSJohn Dyson			continue;
df8bae1dSRodney W. Grimes		}
2965a453SKip Macy
e8f26319SKip Macy		/*
e8f26319SKip Macy		 * A held page may be undergoing I/O, so skip it.
e8f26319SKip Macy		 */
9ee2165fSAlan Cox		if (m->hold_count) {
e8f26319SKip Macy			vm_page_unlock(m);
e8f26319SKip Macy			vm_page_requeue(m);
2965a453SKip Macy			addl_page_shortage++;
2965a453SKip Macy			continue;
2965a453SKip Macy		}
2965a453SKip Macy
26f9a767SRodney W. Grimes		/*
a1287949SEivind Eklund		 * Don't mess with busy pages, keep in the front of the
b18bfc3dSJohn Dyson		 * queue, most likely are being paged out.
		 *
		 * NOTE(review): the failure path unlocks the object, so
		 * vm_pageout_fallback_object_lock() evidently returns with
		 * the object lock held regardless of its result -- confirm.
		 * hold_count is re-checked only when the fallback was used.
26f9a767SRodney W. Grimes		 */
9ee2165fSAlan Cox		object = m->object;
8dbca793STor Egge		if (!VM_OBJECT_TRYLOCK(object) &&
8dbca793STor Egge		    (!vm_pageout_fallback_object_lock(m, &next) ||
8dbca793STor Egge			m->hold_count != 0)) {
8dbca793STor Egge			VM_OBJECT_UNLOCK(object);
2965a453SKip Macy			vm_page_unlock(m);
34d9e6fdSAlan Cox			addl_page_shortage++;
34d9e6fdSAlan Cox			continue;
34d9e6fdSAlan Cox		}
9af80719SAlan Cox		if (m->busy || (m->oflags & VPO_BUSY)) {
2965a453SKip Macy			vm_page_unlock(m);
34d9e6fdSAlan Cox			VM_OBJECT_UNLOCK(object);
b182ec9eSJohn Dyson			addl_page_shortage++;
26f9a767SRodney W. Grimes			continue;
26f9a767SRodney W. Grimes		}
bd7e5f99SJohn Dyson
7e006499SJohn Dyson		/*
1c7c3c6aSMatthew Dillon		 * If the object is not being used, we ignore previous
1c7c3c6aSMatthew Dillon		 * references.
7e006499SJohn Dyson		 */
34d9e6fdSAlan Cox		if (object->ref_count == 0) {
3407fefeSKonstantin Belousov			vm_page_aflag_clear(m, PGA_REFERENCED);
47916d0cSAlan Cox			KASSERT(!pmap_page_is_mapped(m),
47916d0cSAlan Cox			    ("vm_pageout_scan: page %p is mapped", m));
7e006499SJohn Dyson
7e006499SJohn Dyson		/*
1c7c3c6aSMatthew Dillon		 * Otherwise, if the page has been referenced while in the
1c7c3c6aSMatthew Dillon		 * inactive queue, we bump the "activation count" upwards,
1c7c3c6aSMatthew Dillon		 * making it less likely that the page will be added back to
1c7c3c6aSMatthew Dillon		 * the inactive queue prematurely again.  Here we check the
1c7c3c6aSMatthew Dillon		 * page tables (or emulated bits, if any), given the upper
1c7c3c6aSMatthew Dillon		 * level VM system not knowing anything about existing
1c7c3c6aSMatthew Dillon		 * references.
		 *
		 * NOTE(review): act_count is updated after the page lock has
		 * been dropped (the object lock is still held) -- confirm
		 * which lock is meant to protect act_count.
7e006499SJohn Dyson		 */
3407fefeSKonstantin Belousov		} else if (((m->aflags & PGA_REFERENCED) == 0) &&
0385347cSPeter Wemm			(actcount = pmap_ts_referenced(m))) {
ef743ce6SJohn Dyson			vm_page_activate(m);
2965a453SKip Macy			vm_page_unlock(m);
4c6a2e7aSAlan Cox			m->act_count += actcount + ACT_ADVANCE;
4c6a2e7aSAlan Cox			VM_OBJECT_UNLOCK(object);
ef743ce6SJohn Dyson			continue;
2fe6e4d7SDavid Greenman		}
ef743ce6SJohn Dyson
7e006499SJohn Dyson		/*
1c7c3c6aSMatthew Dillon		 * If the upper level VM system knows about any page
1c7c3c6aSMatthew Dillon		 * references, we activate the page.  We also set the
1c7c3c6aSMatthew Dillon		 * "activation count" higher than normal so that we will less
1c7c3c6aSMatthew Dillon		 * likely place pages back onto the inactive queue again.
7e006499SJohn Dyson		 */
3407fefeSKonstantin Belousov		if ((m->aflags & PGA_REFERENCED) != 0) {
3407fefeSKonstantin Belousov			vm_page_aflag_clear(m, PGA_REFERENCED);
0385347cSPeter Wemm			actcount = pmap_ts_referenced(m);
26f9a767SRodney W. Grimes			vm_page_activate(m);
2965a453SKip Macy			vm_page_unlock(m);
4c6a2e7aSAlan Cox			m->act_count += actcount + ACT_ADVANCE + 1;
4c6a2e7aSAlan Cox			VM_OBJECT_UNLOCK(object);
0d94caffSDavid Greenman			continue;
0d94caffSDavid Greenman		}
67bf6868SJohn Dyson
7e006499SJohn Dyson		/*
b78ddb0bSAlan Cox		 * If the upper level VM system does not believe that the page
b78ddb0bSAlan Cox		 * is fully dirty, but it is mapped for write access, then we
b78ddb0bSAlan Cox		 * consult the pmap to see if the page's dirty status should
b78ddb0bSAlan Cox		 * be updated.
7e006499SJohn Dyson		 */
b78ddb0bSAlan Cox		if (m->dirty != VM_PAGE_BITS_ALL &&
*6031c68dSAlan Cox		    pmap_page_is_write_mapped(m)) {
a3dfacb5SAlan Cox			/*
a3dfacb5SAlan Cox			 * Avoid a race condition: Unless write access is
a3dfacb5SAlan Cox			 * removed from the page, another processor could
a3dfacb5SAlan Cox			 * modify it before all access is removed by the call
a3dfacb5SAlan Cox			 * to vm_page_cache() below.  If vm_page_cache() finds
a3dfacb5SAlan Cox			 * that the page has been modified when it removes all
a3dfacb5SAlan Cox			 * access, it panics because it cannot cache dirty
a3dfacb5SAlan Cox			 * pages.  In principle, we could eliminate just write
a3dfacb5SAlan Cox			 * access here rather than all access.  In the expected
a3dfacb5SAlan Cox			 * case, when there are no last instant modifications
a3dfacb5SAlan Cox			 * to the page, removing all access will be cheaper
a3dfacb5SAlan Cox			 * overall.
a3dfacb5SAlan Cox			 */
b78ddb0bSAlan Cox			if (pmap_is_modified(m))
7dbf82dcSMatthew Dillon				vm_page_dirty(m);
b78ddb0bSAlan Cox			else if (m->dirty == 0)
b78ddb0bSAlan Cox				pmap_remove_all(m);
30dcfc09SJohn Dyson		}
dcbcd518SBruce Evans
		/*
		 * Dispose of the page according to its state.  Every branch
		 * except the laundering one (which ends in its own
		 * "continue") falls through to the common unlock at the
		 * bottom of the loop; that includes dirty pages seen after
		 * maxlaunder has been used up.
		 */
6989c456SAlan Cox		if (m->valid == 0) {
7e006499SJohn Dyson			/*
7e006499SJohn Dyson			 * Invalid pages can be easily freed
7e006499SJohn Dyson			 */
6989c456SAlan Cox			vm_page_free(m);
393a081dSAttilio Rao			cnt.v_dfree++;
1c7c3c6aSMatthew Dillon			--page_shortage;
bd7e5f99SJohn Dyson		} else if (m->dirty == 0) {
6989c456SAlan Cox			/*
6989c456SAlan Cox			 * Clean pages can be placed onto the cache queue.
6989c456SAlan Cox			 * This effectively frees them.
6989c456SAlan Cox			 */
bd7e5f99SJohn Dyson			vm_page_cache(m);
1c7c3c6aSMatthew Dillon			--page_shortage;
2b6b0df7SMatthew Dillon		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
7e006499SJohn Dyson			/*
2b6b0df7SMatthew Dillon			 * Dirty pages need to be paged out, but flushing
2b6b0df7SMatthew Dillon			 * a page is extremely expensive versus freeing
2b6b0df7SMatthew Dillon			 * a clean page.  Rather than artificially limiting
2b6b0df7SMatthew Dillon			 * the number of pages we can flush, we instead give
2b6b0df7SMatthew Dillon			 * dirty pages extra priority on the inactive queue
2b6b0df7SMatthew Dillon			 * by forcing them to be cycled through the queue
2b6b0df7SMatthew Dillon			 * twice before being flushed, after which the
2b6b0df7SMatthew Dillon			 * (now clean) page will cycle through once more
2b6b0df7SMatthew Dillon			 * before being freed.  This significantly extends
2b6b0df7SMatthew Dillon			 * the thrash point for a heavily loaded machine.
7e006499SJohn Dyson			 */
3407fefeSKonstantin Belousov			m->flags |= PG_WINATCFLS;
e5b006ffSAlan Cox			vm_page_requeue(m);
0d94caffSDavid Greenman		} else if (maxlaunder > 0) {
2b6b0df7SMatthew Dillon			/*
2b6b0df7SMatthew Dillon			 * We always want to try to flush some dirty pages if
2b6b0df7SMatthew Dillon			 * we encounter them, to keep the system stable.
2b6b0df7SMatthew Dillon			 * Normally this number is small, but under extreme
2b6b0df7SMatthew Dillon			 * pressure where there are insufficient clean pages
2b6b0df7SMatthew Dillon			 * on the inactive queue, we may have to go all out.
2b6b0df7SMatthew Dillon			 */
97824da3SAlan Cox			int swap_pageouts_ok, vfslocked = 0;
f6b04d2bSDavid Greenman			struct vnode *vp = NULL;
14137dc0SAlan Cox			struct mount *mp = NULL;
0d94caffSDavid Greenman
			/*
			 * Objects other than OBJT_SWAP/OBJT_DEFAULT may
			 * always be paged out.  For those two types, pageout
			 * is allowed when neither defer_swap_pageouts nor
			 * disable_swap_pageouts is set, or when only
			 * defer_swap_pageouts is set and vm_page_count_min()
			 * is true.
			 */
12ac6a1dSJohn Dyson			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
12ac6a1dSJohn Dyson				swap_pageouts_ok = 1;
12ac6a1dSJohn Dyson			} else {
12ac6a1dSJohn Dyson				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
12ac6a1dSJohn Dyson				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
90ecac61SMatthew Dillon				vm_page_count_min());
12ac6a1dSJohn Dyson
12ac6a1dSJohn Dyson			}
70111b90SJohn Dyson
70111b90SJohn Dyson			/*
1c7c3c6aSMatthew Dillon			 * We don't bother paging objects that are "dead".
1c7c3c6aSMatthew Dillon			 * Those objects are in a "rundown" state.
70111b90SJohn Dyson			 */
70111b90SJohn Dyson			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
2965a453SKip Macy				vm_page_unlock(m);
3562af12SAlan Cox				VM_OBJECT_UNLOCK(object);
e5b006ffSAlan Cox				vm_page_requeue(m);
12ac6a1dSJohn Dyson				continue;
12ac6a1dSJohn Dyson			}
12ac6a1dSJohn Dyson
1c7c3c6aSMatthew Dillon			/*
625e6c0aSTor Egge			 * Following operations may unlock
625e6c0aSTor Egge			 * vm_page_queue_mtx, invalidating the 'next'
625e6c0aSTor Egge			 * pointer.  To prevent an inordinate number
625e6c0aSTor Egge			 * of restarts we use our marker to remember
625e6c0aSTor Egge			 * our place.
625e6c0aSTor Egge			 *
625e6c0aSTor Egge			 */
625e6c0aSTor Egge			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl,
625e6c0aSTor Egge					   m, &marker, pageq);
625e6c0aSTor Egge			/*
2b6b0df7SMatthew Dillon			 * The object is already known NOT to be dead.   It
2b6b0df7SMatthew Dillon			 * is possible for the vget() to block the whole
2b6b0df7SMatthew Dillon			 * pageout daemon, but the new low-memory handling
2b6b0df7SMatthew Dillon			 * code should prevent it.
1c7c3c6aSMatthew Dillon			 *
2b6b0df7SMatthew Dillon			 * The previous code skipped locked vnodes and, worse,
2b6b0df7SMatthew Dillon			 * reordered pages in the queue.  This results in
2b6b0df7SMatthew Dillon			 * completely non-deterministic operation and, on a
2b6b0df7SMatthew Dillon			 * busy system, can lead to extremely non-optimal
2b6b0df7SMatthew Dillon			 * pageouts.  For example, it can cause clean pages
2b6b0df7SMatthew Dillon			 * to be freed and dirty pages to be moved to the end
2b6b0df7SMatthew Dillon			 * of the queue.  Since dirty pages are also moved to
2b6b0df7SMatthew Dillon			 * the end of the queue once-cleaned, this gives
2b6b0df7SMatthew Dillon			 * way too large a weighting to deferring the freeing
2b6b0df7SMatthew Dillon			 * of dirty pages.
1c7c3c6aSMatthew Dillon			 *
23b59018SMatthew Dillon			 * We can't wait forever for the vnode lock, we might
23b59018SMatthew Dillon			 * deadlock due to a vn_read() getting stuck in
23b59018SMatthew Dillon			 * vm_wait while holding this vnode.  We skip the
23b59018SMatthew Dillon			 * vnode if we can't get it in a reasonable amount
23b59018SMatthew Dillon			 * of time.
1c7c3c6aSMatthew Dillon			 */
1c7c3c6aSMatthew Dillon			if (object->type == OBJT_VNODE) {
2965a453SKip Macy				vm_page_unlock_queues();
2965a453SKip Macy				vm_page_unlock(m);
24a1cce3SDavid Greenman				vp = object->handle;
db27dcc0STor Egge				if (vp->v_type == VREG &&
db27dcc0STor Egge				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
					/*
					 * Clear mp so that the cleanup at
					 * unlock_and_continue does not
					 * release resources that were never
					 * acquired.
					 */
6129343dSKonstantin Belousov					mp = NULL;
db27dcc0STor Egge					++pageout_lock_miss;
db27dcc0STor Egge					if (object->flags & OBJ_MIGHTBEDIRTY)
db27dcc0STor Egge						vnodes_skipped++;
2965a453SKip Macy					vm_page_lock_queues();
625e6c0aSTor Egge					goto unlock_and_continue;
db27dcc0STor Egge				}
				/*
				 * NOTE(review): vn_start_write() is only
				 * called for VREG vnodes, so for any other
				 * vnode type mp would still be NULL here and
				 * this assertion would fire; without
				 * assertions the cleanup guarded by
				 * "mp != NULL" would be skipped.  Confirm
				 * that OBJT_VNODE objects reaching this point
				 * always have VREG vnodes.
				 */
b9f180d1SKonstantin Belousov				KASSERT(mp != NULL,
b9f180d1SKonstantin Belousov				    ("vp %p with NULL v_mount", vp));
				/*
				 * Take an object reference before dropping
				 * the object lock; it is released by the
				 * vm_object_deallocate() call at
				 * unlock_and_continue.
				 */
14137dc0SAlan Cox				vm_object_reference_locked(object);
3562af12SAlan Cox				VM_OBJECT_UNLOCK(object);
97824da3SAlan Cox				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
97824da3SAlan Cox				if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK,
97824da3SAlan Cox				    curthread)) {
3562af12SAlan Cox					VM_OBJECT_LOCK(object);
3e1b578aSAlan Cox					vm_page_lock_queues();
23b59018SMatthew Dillon					++pageout_lock_miss;
aef922f5SJohn Dyson					if (object->flags & OBJ_MIGHTBEDIRTY)
925a3a41SJohn Dyson						vnodes_skipped++;
					/* vget() failed: nothing to vput(). */
625e6c0aSTor Egge					vp = NULL;
625e6c0aSTor Egge					goto unlock_and_continue;
85a376ebSJohn Dyson				}
3562af12SAlan Cox				VM_OBJECT_LOCK(object);
2965a453SKip Macy				vm_page_lock(m);
3e1b578aSAlan Cox				vm_page_lock_queues();
f35329acSJohn Dyson				/*
936524aaSMatthew Dillon				 * The page might have been moved to another
936524aaSMatthew Dillon				 * queue during potential blocking in vget()
936524aaSMatthew Dillon				 * above.  The page might have been freed and
14137dc0SAlan Cox				 * reused for another vnode.
f35329acSJohn Dyson				 */
9cf51988SAlan Cox				if (m->queue != PQ_INACTIVE ||
936524aaSMatthew Dillon				    m->object != object ||
625e6c0aSTor Egge				    TAILQ_NEXT(m, pageq) != &marker) {
2965a453SKip Macy					vm_page_unlock(m);
b182ec9eSJohn Dyson					if (object->flags & OBJ_MIGHTBEDIRTY)
925a3a41SJohn Dyson						vnodes_skipped++;
3562af12SAlan Cox					goto unlock_and_continue;
b182ec9eSJohn Dyson				}
b182ec9eSJohn Dyson
f35329acSJohn Dyson				/*
936524aaSMatthew Dillon				 * The page may have been busied during the
14137dc0SAlan Cox				 * blocking in vget().  We don't move the
936524aaSMatthew Dillon				 * page back onto the end of the queue so that
936524aaSMatthew Dillon				 * statistics are more correct if we don't.
f35329acSJohn Dyson				 */
9af80719SAlan Cox				if (m->busy || (m->oflags & VPO_BUSY)) {
2965a453SKip Macy					vm_page_unlock(m);
3562af12SAlan Cox					goto unlock_and_continue;
b182ec9eSJohn Dyson				}
b182ec9eSJohn Dyson
f35329acSJohn Dyson				/*
57601bcbSMatthew Dillon				 * If the page has become held it might
57601bcbSMatthew Dillon				 * be undergoing I/O, so skip it
f35329acSJohn Dyson				 */
b182ec9eSJohn Dyson				if (m->hold_count) {
2965a453SKip Macy					vm_page_unlock(m);
e5b006ffSAlan Cox					vm_page_requeue(m);
b182ec9eSJohn Dyson					if (object->flags & OBJ_MIGHTBEDIRTY)
925a3a41SJohn Dyson						vnodes_skipped++;
3562af12SAlan Cox					goto unlock_and_continue;
f6b04d2bSDavid Greenman				}
f6b04d2bSDavid Greenman			}
f6b04d2bSDavid Greenman
0d94caffSDavid Greenman			/*
0d94caffSDavid Greenman			 * If a page is dirty, then it is either being washed
0d94caffSDavid Greenman			 * (but not yet cleaned) or it is still in the
0d94caffSDavid Greenman			 * laundry.  If it is still in the laundry, then we
2b6b0df7SMatthew Dillon			 * start the cleaning operation.
936524aaSMatthew Dillon			 *
2b6b0df7SMatthew Dillon			 * decrement page_shortage on success to account for
2b6b0df7SMatthew Dillon			 * the (future) cleaned page.  Otherwise we could wind
2b6b0df7SMatthew Dillon			 * up laundering or cleaning too many pages.
			 *
			 * NOTE(review): the assertion at unlock_and_continue
			 * requires the page lock to be released by then, so
			 * vm_pageout_clean() presumably drops it -- confirm.
0d94caffSDavid Greenman			 */
2965a453SKip Macy			vm_page_unlock_queues();
2b6b0df7SMatthew Dillon			if (vm_pageout_clean(m) != 0) {
2b6b0df7SMatthew Dillon				--page_shortage;
936524aaSMatthew Dillon				--maxlaunder;
2b6b0df7SMatthew Dillon			}
2965a453SKip Macy			vm_page_lock_queues();
			/*
			 * Common exit for the laundering path.  On entry the
			 * page lock is not held, while the object lock and
			 * the page queues lock are.  A non-NULL mp means the
			 * vnode path got as far as taking the write
			 * reference and the object reference, which are
			 * released here with the queues lock dropped.
			 */
3562af12SAlan Coxunlock_and_continue:
2965a453SKip Macy			vm_page_lock_assert(m, MA_NOTOWNED);
6989c456SAlan Cox			VM_OBJECT_UNLOCK(object);
14137dc0SAlan Cox			if (mp != NULL) {
6989c456SAlan Cox				vm_page_unlock_queues();
14137dc0SAlan Cox				if (vp != NULL)
f6b04d2bSDavid Greenman					vput(vp);
97824da3SAlan Cox				VFS_UNLOCK_GIANT(vfslocked);
14137dc0SAlan Cox				vm_object_deallocate(object);
f2a2857bSKirk McKusick				vn_finished_write(mp);
6989c456SAlan Cox				vm_page_lock_queues();
6989c456SAlan Cox			}
			/* Resume after the marker, then take it off the queue. */
625e6c0aSTor Egge			next = TAILQ_NEXT(&marker, pageq);
625e6c0aSTor Egge			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
625e6c0aSTor Egge				     &marker, pageq);
2965a453SKip Macy			vm_page_lock_assert(m, MA_NOTOWNED);
6989c456SAlan Cox			continue;
f2a2857bSKirk McKusick		}
2965a453SKip Macy		vm_page_unlock(m);
3562af12SAlan Cox		VM_OBJECT_UNLOCK(object);
0d94caffSDavid Greenman	}
26f9a767SRodney W. Grimes
df8bae1dSRodney W. Grimes	/*
936524aaSMatthew Dillon	 * Compute the number of pages we want to try to move from the
936524aaSMatthew Dillon	 * active queue to the inactive queue.
1c7c3c6aSMatthew Dillon	 */
2feb50bfSAttilio Rao	page_shortage = vm_paging_target() +
2feb50bfSAttilio Rao		cnt.v_inactive_target - cnt.v_inactive_count;
b182ec9eSJohn Dyson	page_shortage += addl_page_shortage;
1c7c3c6aSMatthew Dillon
1c7c3c6aSMatthew Dillon	/*
936524aaSMatthew Dillon	 * Scan the active queue for things we can deactivate. We nominally
936524aaSMatthew Dillon	 * track the per-page activity counter and use it to locate
936524aaSMatthew Dillon	 * deactivation candidates.
1c7c3c6aSMatthew Dillon	 */
2feb50bfSAttilio Rao	pcount = cnt.v_active_count;
be72f788SAlan Cox	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
2965a453SKip Macy	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1c7c3c6aSMatthew Dillon
b18bfc3dSJohn Dyson	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
f35329acSJohn Dyson
9cf51988SAlan Cox		KASSERT(m->queue == PQ_ACTIVE,
d3c09dd7SAlan Cox		    ("vm_pageout_scan: page %p isn't active", m));
f35329acSJohn Dyson
b18bfc3dSJohn Dyson		next = TAILQ_NEXT(m, pageq);
8dbca793STor Egge		if ((m->flags & PG_MARKER) != 0) {
8dbca793STor Egge			m = next;
8dbca793STor Egge			continue;
8dbca793STor Egge		}
7900f95dSKonstantin Belousov		KASSERT((m->flags & PG_FICTITIOUS) == 0,
7900f95dSKonstantin Belousov		    ("Fictitious page %p cannot be in active queue", m));
7900f95dSKonstantin Belousov		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7900f95dSKonstantin Belousov		    ("Unmanaged page %p cannot be in active queue", m));
		/*
		 * Same locking protocol as in the inactive scan: on failure
		 * the page (and below, the object) is unlocked and skipped.
		 */
9ee2165fSAlan Cox		if (!vm_pageout_page_lock(m, &next)) {
8c616246SKonstantin Belousov			vm_page_unlock(m);
2965a453SKip Macy			m = next;
2965a453SKip Macy			continue;
2965a453SKip Macy		}
9ee2165fSAlan Cox		object = m->object;
8dbca793STor Egge		if (!VM_OBJECT_TRYLOCK(object) &&
8dbca793STor Egge		    !vm_pageout_fallback_object_lock(m, &next)) {
8dbca793STor Egge			VM_OBJECT_UNLOCK(object);
2965a453SKip Macy			vm_page_unlock(m);
4b8a5c40SAlan Cox			m = next;
b08abf6cSAlan Cox			continue;
b08abf6cSAlan Cox		}
b08abf6cSAlan Cox
df8bae1dSRodney W. Grimes		/*
26f9a767SRodney W. Grimes		 * Don't deactivate pages that are busy.
		 * Held pages are treated the same way; both are requeued.
df8bae1dSRodney W. Grimes		 */
a647a309SDavid Greenman		if ((m->busy != 0) ||
9af80719SAlan Cox		    (m->oflags & VPO_BUSY) ||
f6b04d2bSDavid Greenman		    (m->hold_count != 0)) {
2965a453SKip Macy			vm_page_unlock(m);
b08abf6cSAlan Cox			VM_OBJECT_UNLOCK(object);
e5b006ffSAlan Cox			vm_page_requeue(m);
26f9a767SRodney W. Grimes			m = next;
26f9a767SRodney W. Grimes			continue;
df8bae1dSRodney W. Grimes		}
b18bfc3dSJohn Dyson
b18bfc3dSJohn Dyson		/*
b18bfc3dSJohn Dyson		 * The count for pagedaemon pages is done after checking the
956f3135SPhilippe Charnier		 * page for eligibility...
b18bfc3dSJohn Dyson		 */
393a081dSAttilio Rao		cnt.v_pdpages++;
ef743ce6SJohn Dyson
7e006499SJohn Dyson		/*
7e006499SJohn Dyson		 * Check to see "how much" the page has been used.
		 * References are only counted when the object itself is
		 * still referenced; act_count is capped at ACT_MAX.
7e006499SJohn Dyson		 */
7e006499SJohn Dyson		actcount = 0;
b08abf6cSAlan Cox		if (object->ref_count != 0) {
3407fefeSKonstantin Belousov			if (m->aflags & PGA_REFERENCED) {
7e006499SJohn Dyson				actcount += 1;
0d94caffSDavid Greenman			}
0385347cSPeter Wemm			actcount += pmap_ts_referenced(m);
7e006499SJohn Dyson			if (actcount) {
7e006499SJohn Dyson				m->act_count += ACT_ADVANCE + actcount;
38efa82bSJohn Dyson				if (m->act_count > ACT_MAX)
38efa82bSJohn Dyson					m->act_count = ACT_MAX;
38efa82bSJohn Dyson			}
b18bfc3dSJohn Dyson		}
ef743ce6SJohn Dyson
7e006499SJohn Dyson		/*
7e006499SJohn Dyson		 * Since we have "tested" this bit, we need to clear it now.
7e006499SJohn Dyson		 */
3407fefeSKonstantin Belousov		vm_page_aflag_clear(m, PGA_REFERENCED);
ef743ce6SJohn Dyson
7e006499SJohn Dyson		/*
7e006499SJohn Dyson		 * Only if an object is currently being used, do we use the
7e006499SJohn Dyson		 * page activation count stats.
7e006499SJohn Dyson		 */
b08abf6cSAlan Cox		if (actcount && (object->ref_count != 0)) {
e5b006ffSAlan Cox			vm_page_requeue(m);
26f9a767SRodney W. Grimes		} else {
			/*
			 * Unreferenced: decay act_count (never below zero)
			 * and take the page off the active queue once it is
			 * eligible.  Each page moved counts against
			 * page_shortage.
			 */
38efa82bSJohn Dyson			m->act_count -= min(m->act_count, ACT_DECLINE);
2b6b0df7SMatthew Dillon			if (vm_pageout_algorithm ||
b08abf6cSAlan Cox			    object->ref_count == 0 ||
2b6b0df7SMatthew Dillon			    m->act_count == 0) {
925a3a41SJohn Dyson				page_shortage--;
b08abf6cSAlan Cox				if (object->ref_count == 0) {
5d4a7b79SAlan Cox					KASSERT(!pmap_page_is_mapped(m),
5d4a7b79SAlan Cox				    ("vm_pageout_scan: page %p is mapped", m));
d4a272dbSJohn Dyson					if (m->dirty == 0)
0d94caffSDavid Greenman						vm_page_cache(m);
d4a272dbSJohn Dyson					else
d4a272dbSJohn Dyson						vm_page_deactivate(m);
0d94caffSDavid Greenman				} else {
26f9a767SRodney W. Grimes					vm_page_deactivate(m);
df8bae1dSRodney W. Grimes				}
38efa82bSJohn Dyson			} else {
e5b006ffSAlan Cox				vm_page_requeue(m);
38efa82bSJohn Dyson			}
df8bae1dSRodney W. Grimes		}
2965a453SKip Macy		vm_page_unlock(m);
b08abf6cSAlan Cox		VM_OBJECT_UNLOCK(object);
26f9a767SRodney W. Grimes		m = next;
26f9a767SRodney W. Grimes	}
8ffc1519SAlan Cox	vm_page_unlock_queues();
ceb0cf87SJohn Dyson#if !defined(NO_SWAPPING)
ceb0cf87SJohn Dyson	/*
ceb0cf87SJohn Dyson	 * Idle process swapout -- run once per second.
	 * lsec remembers the time_second value of the last request.
ceb0cf87SJohn Dyson	 */
ceb0cf87SJohn Dyson	if (vm_swap_idle_enabled) {
ceb0cf87SJohn Dyson		static long lsec;
227ee8a1SPoul-Henning Kamp		if (time_second != lsec) {
97824da3SAlan Cox			vm_req_vmdaemon(VM_SWAP_IDLE);
227ee8a1SPoul-Henning Kamp			lsec = time_second;
ceb0cf87SJohn Dyson		}
ceb0cf87SJohn Dyson	}
ceb0cf87SJohn Dyson#endif
ceb0cf87SJohn Dyson
5663e6deSDavid Greenman	/*
f6b04d2bSDavid Greenman	 * If we didn't get enough free pages, and we have skipped a vnode
4c1f8ee9SDavid Greenman	 * in a writeable object, wakeup the sync daemon.  And kick swapout
4c1f8ee9SDavid Greenman	 * if we did not get enough free pages.
f6b04d2bSDavid Greenman	 */
90ecac61SMatthew Dillon	if (vm_paging_target() > 0) {
90ecac61SMatthew Dillon		if (vnodes_skipped && vm_page_count_min())
d50c1994SPeter Wemm			(void) speedup_syncer();
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
97824da3SAlan Cox		if (vm_swap_enabled && vm_page_count_target())
97824da3SAlan Cox			vm_req_vmdaemon(VM_SWAP_NORMAL);
5afce282SDavid Greenman#endif
4c1f8ee9SDavid Greenman	}
4c1f8ee9SDavid Greenman
f6b04d2bSDavid Greenman	/*
e92686d0SDavid Schultz	 * If we are critically low on one of RAM or swap and low on
e92686d0SDavid Schultz	 * the other, kill the largest process.  However, we avoid
e92686d0SDavid Schultz	 * doing this on the first pass in order to give ourselves a
e92686d0SDavid Schultz	 * chance to flush out dirty vnode-backed pages and to allow
e92686d0SDavid Schultz	 * active pages to be moved to the inactive queue and reclaimed.
2025d69bSKonstantin Belousov	 */
2025d69bSKonstantin Belousov	if (pass != 0 &&
2025d69bSKonstantin Belousov	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
2025d69bSKonstantin Belousov	     (swap_pager_full && vm_paging_target() > 0)))
2025d69bSKonstantin Belousov		vm_pageout_oom(VM_OOM_MEM);
2025d69bSKonstantin Belousov}
2025d69bSKonstantin Belousov
2025d69bSKonstantin Belousov
2025d69bSKonstantin Belousovvoid
2025d69bSKonstantin Belousovvm_pageout_oom(int shortage)
2025d69bSKonstantin Belousov{
2025d69bSKonstantin Belousov	struct proc *p, *bigproc;
2025d69bSKonstantin Belousov	vm_offset_t size, bigsize;
2025d69bSKonstantin Belousov	struct thread *td;
6bed074cSKonstantin Belousov	struct vmspace *vm;
2025d69bSKonstantin Belousov
2025d69bSKonstantin Belousov	/*
1c58e4e5SJohn Baldwin	 * We keep the process bigproc locked once we find it to keep anyone
1c58e4e5SJohn Baldwin	 * from messing with it; however, there is a possibility of
1c58e4e5SJohn Baldwin	 * deadlock if process B is bigproc and one of it's child processes
1c58e4e5SJohn Baldwin	 * attempts to propagate a signal to B while we are waiting for A's
1c58e4e5SJohn Baldwin	 * lock while walking this list.  To avoid this, we don't block on
1c58e4e5SJohn Baldwin	 * the process lock but just skip a process if it is already locked.
5663e6deSDavid Greenman	 */
5663e6deSDavid Greenman	bigproc = NULL;
5663e6deSDavid Greenman	bigsize = 0;
1005a129SJohn Baldwin	sx_slock(&allproc_lock);
e602ba25SJulian Elischer	FOREACH_PROC_IN_SYSTEM(p) {
e602ba25SJulian Elischer		int breakout;
dcbcd518SBruce Evans
1c58e4e5SJohn Baldwin		if (PROC_TRYLOCK(p) == 0)
1c58e4e5SJohn Baldwin			continue;
1c58e4e5SJohn Baldwin		/*
3f1c4c4fSKonstantin Belousov		 * If this is a system, protected or killed process, skip it.
5663e6deSDavid Greenman		 */
8e6fa660SJohn Baldwin		if (p->p_state != PRS_NORMAL ||
8e6fa660SJohn Baldwin		    (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM)) ||
3f1c4c4fSKonstantin Belousov		    (p->p_pid == 1) || P_KILLED(p) ||
8f60c087SPoul-Henning Kamp		    ((p->p_pid < 48) && (swap_pager_avail != 0))) {
8606d880SJohn Baldwin			PROC_UNLOCK(p);
5663e6deSDavid Greenman			continue;
5663e6deSDavid Greenman		}
5663e6deSDavid Greenman		/*
dcbcd518SBruce Evans		 * If the process is in a non-running type state,
e602ba25SJulian Elischer		 * don't touch it.  Check all the threads individually.
5663e6deSDavid Greenman		 */
e602ba25SJulian Elischer		breakout = 0;
e602ba25SJulian Elischer		FOREACH_THREAD_IN_PROC(p, td) {
982d11f8SJeff Roberson			thread_lock(td);
71fad9fdSJulian Elischer			if (!TD_ON_RUNQ(td) &&
71fad9fdSJulian Elischer			    !TD_IS_RUNNING(td) &&
f497cda2SEdward Tomasz Napierala			    !TD_IS_SLEEPING(td) &&
f497cda2SEdward Tomasz Napierala			    !TD_IS_SUSPENDED(td)) {
982d11f8SJeff Roberson				thread_unlock(td);
e602ba25SJulian Elischer				breakout = 1;
e602ba25SJulian Elischer				break;
e602ba25SJulian Elischer			}
982d11f8SJeff Roberson			thread_unlock(td);
e602ba25SJulian Elischer		}
e602ba25SJulian Elischer		if (breakout) {
1c58e4e5SJohn Baldwin			PROC_UNLOCK(p);
5663e6deSDavid Greenman			continue;
5663e6deSDavid Greenman		}
5663e6deSDavid Greenman		/*
5663e6deSDavid Greenman		 * get the process size
5663e6deSDavid Greenman		 */
6bed074cSKonstantin Belousov		vm = vmspace_acquire_ref(p);
6bed074cSKonstantin Belousov		if (vm == NULL) {
6bed074cSKonstantin Belousov			PROC_UNLOCK(p);
6bed074cSKonstantin Belousov			continue;
6bed074cSKonstantin Belousov		}
6bed074cSKonstantin Belousov		if (!vm_map_trylock_read(&vm->vm_map)) {
6bed074cSKonstantin Belousov			vmspace_free(vm);
72d97679SDavid Schultz			PROC_UNLOCK(p);
72d97679SDavid Schultz			continue;
72d97679SDavid Schultz		}
7981aa24SKonstantin Belousov		size = vmspace_swap_count(vm);
6bed074cSKonstantin Belousov		vm_map_unlock_read(&vm->vm_map);
2025d69bSKonstantin Belousov		if (shortage == VM_OOM_MEM)
6bed074cSKonstantin Belousov			size += vmspace_resident_count(vm);
6bed074cSKonstantin Belousov		vmspace_free(vm);
5663e6deSDavid Greenman		/*
5663e6deSDavid Greenman		 * if the this process is bigger than the biggest one
5663e6deSDavid Greenman		 * remember it.
5663e6deSDavid Greenman		 */
5663e6deSDavid Greenman		if (size > bigsize) {
1c58e4e5SJohn Baldwin			if (bigproc != NULL)
1c58e4e5SJohn Baldwin				PROC_UNLOCK(bigproc);
5663e6deSDavid Greenman			bigproc = p;
5663e6deSDavid Greenman			bigsize = size;
1c58e4e5SJohn Baldwin		} else
1c58e4e5SJohn Baldwin			PROC_UNLOCK(p);
5663e6deSDavid Greenman	}
1005a129SJohn Baldwin	sx_sunlock(&allproc_lock);
5663e6deSDavid Greenman	if (bigproc != NULL) {
729b1e51SDavid Greenman		killproc(bigproc, "out of swap space");
fa885116SJulian Elischer		sched_nice(bigproc, PRIO_MIN);
1c58e4e5SJohn Baldwin		PROC_UNLOCK(bigproc);
2feb50bfSAttilio Rao		wakeup(&cnt.v_free_count);
5663e6deSDavid Greenman	}
5663e6deSDavid Greenman}
26f9a767SRodney W. Grimes
dc2efb27SJohn Dyson/*
dc2efb27SJohn Dyson * This routine tries to maintain the pseudo LRU active queue,
dc2efb27SJohn Dyson * so that during long periods of time where there is no paging,
956f3135SPhilippe Charnier * that some statistic accumulation still occurs.  This code
dc2efb27SJohn Dyson * helps the situation where paging just starts to occur.
dc2efb27SJohn Dyson */
dc2efb27SJohn Dysonstatic void
dc2efb27SJohn Dysonvm_pageout_page_stats()
dc2efb27SJohn Dyson{
b86e6ec0SAlan Cox	vm_object_t object;
dc2efb27SJohn Dyson	vm_page_t m,next;
dc2efb27SJohn Dyson	int pcount,tpcount;		/* Number of pages to check */
dc2efb27SJohn Dyson	static int fullintervalcount = 0;
bef608bdSJohn Dyson	int page_shortage;
bef608bdSJohn Dyson
90ecac61SMatthew Dillon	page_shortage =
2feb50bfSAttilio Rao	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
2feb50bfSAttilio Rao	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
90ecac61SMatthew Dillon
bef608bdSJohn Dyson	if (page_shortage <= 0)
bef608bdSJohn Dyson		return;
dc2efb27SJohn Dyson
cdaba1f2SAlan Cox	vm_page_lock_queues();
2feb50bfSAttilio Rao	pcount = cnt.v_active_count;
dc2efb27SJohn Dyson	fullintervalcount += vm_pageout_stats_interval;
dc2efb27SJohn Dyson	if (fullintervalcount < vm_pageout_full_stats_interval) {
8d28bf04SAlan Cox		tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count /
8d28bf04SAlan Cox		    cnt.v_page_count;
dc2efb27SJohn Dyson		if (pcount > tpcount)
dc2efb27SJohn Dyson			pcount = tpcount;
883f3caaSMatthew Dillon	} else {
883f3caaSMatthew Dillon		fullintervalcount = 0;
dc2efb27SJohn Dyson	}
dc2efb27SJohn Dyson
be72f788SAlan Cox	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
dc2efb27SJohn Dyson	while ((m != NULL) && (pcount-- > 0)) {
7e006499SJohn Dyson		int actcount;
dc2efb27SJohn Dyson
9cf51988SAlan Cox		KASSERT(m->queue == PQ_ACTIVE,
ab42316cSAlan Cox		    ("vm_pageout_page_stats: page %p isn't active", m));
dc2efb27SJohn Dyson
dc2efb27SJohn Dyson		next = TAILQ_NEXT(m, pageq);
8dbca793STor Egge		if ((m->flags & PG_MARKER) != 0) {
8dbca793STor Egge			m = next;
8dbca793STor Egge			continue;
8dbca793STor Egge		}
2965a453SKip Macy		vm_page_lock_assert(m, MA_NOTOWNED);
9ee2165fSAlan Cox		if (!vm_pageout_page_lock(m, &next)) {
8c616246SKonstantin Belousov			vm_page_unlock(m);
2965a453SKip Macy			m = next;
2965a453SKip Macy			continue;
2965a453SKip Macy		}
9ee2165fSAlan Cox		object = m->object;
8dbca793STor Egge		if (!VM_OBJECT_TRYLOCK(object) &&
8dbca793STor Egge		    !vm_pageout_fallback_object_lock(m, &next)) {
8dbca793STor Egge			VM_OBJECT_UNLOCK(object);
2965a453SKip Macy			vm_page_unlock(m);
b86e6ec0SAlan Cox			m = next;
b86e6ec0SAlan Cox			continue;
b86e6ec0SAlan Cox		}
b86e6ec0SAlan Cox
dc2efb27SJohn Dyson		/*
dc2efb27SJohn Dyson		 * Don't deactivate pages that are busy.
dc2efb27SJohn Dyson		 */
dc2efb27SJohn Dyson		if ((m->busy != 0) ||
9af80719SAlan Cox		    (m->oflags & VPO_BUSY) ||
dc2efb27SJohn Dyson		    (m->hold_count != 0)) {
2965a453SKip Macy			vm_page_unlock(m);
b86e6ec0SAlan Cox			VM_OBJECT_UNLOCK(object);
e5b006ffSAlan Cox			vm_page_requeue(m);
dc2efb27SJohn Dyson			m = next;
dc2efb27SJohn Dyson			continue;
dc2efb27SJohn Dyson		}
dc2efb27SJohn Dyson
7e006499SJohn Dyson		actcount = 0;
3407fefeSKonstantin Belousov		if (m->aflags & PGA_REFERENCED) {
3407fefeSKonstantin Belousov			vm_page_aflag_clear(m, PGA_REFERENCED);
7e006499SJohn Dyson			actcount += 1;
dc2efb27SJohn Dyson		}
dc2efb27SJohn Dyson
0385347cSPeter Wemm		actcount += pmap_ts_referenced(m);
7e006499SJohn Dyson		if (actcount) {
7e006499SJohn Dyson			m->act_count += ACT_ADVANCE + actcount;
dc2efb27SJohn Dyson			if (m->act_count > ACT_MAX)
dc2efb27SJohn Dyson				m->act_count = ACT_MAX;
e5b006ffSAlan Cox			vm_page_requeue(m);
dc2efb27SJohn Dyson		} else {
dc2efb27SJohn Dyson			if (m->act_count == 0) {
7e006499SJohn Dyson				/*
2b6b0df7SMatthew Dillon				 * We turn off page access, so that we have
2b6b0df7SMatthew Dillon				 * more accurate RSS stats.  We don't do this
2b6b0df7SMatthew Dillon				 * in the normal page deactivation when the
2b6b0df7SMatthew Dillon				 * system is loaded VM wise, because the
2b6b0df7SMatthew Dillon				 * cost of the large number of page protect
2b6b0df7SMatthew Dillon				 * operations would be higher than the value
2b6b0df7SMatthew Dillon				 * of doing the operation.
7e006499SJohn Dyson				 */
4fec79beSAlan Cox				pmap_remove_all(m);
dc2efb27SJohn Dyson				vm_page_deactivate(m);
dc2efb27SJohn Dyson			} else {
dc2efb27SJohn Dyson				m->act_count -= min(m->act_count, ACT_DECLINE);
e5b006ffSAlan Cox				vm_page_requeue(m);
dc2efb27SJohn Dyson			}
dc2efb27SJohn Dyson		}
2965a453SKip Macy		vm_page_unlock(m);
b86e6ec0SAlan Cox		VM_OBJECT_UNLOCK(object);
dc2efb27SJohn Dyson		m = next;
dc2efb27SJohn Dyson	}
cdaba1f2SAlan Cox	vm_page_unlock_queues();
dc2efb27SJohn Dyson}
dc2efb27SJohn Dyson
df8bae1dSRodney W. Grimes/*
df8bae1dSRodney W. Grimes *	vm_pageout is the high level pageout daemon.
df8bae1dSRodney W. Grimes */
2b14f991SJulian Elischerstatic void
26f9a767SRodney W. Grimesvm_pageout()
df8bae1dSRodney W. Grimes{
1aab16a6SAlan Cox	int error, pass;
0384fff8SJason Evans
df8bae1dSRodney W. Grimes	/*
df8bae1dSRodney W. Grimes	 * Initialize some paging parameters.
df8bae1dSRodney W. Grimes	 */
2feb50bfSAttilio Rao	cnt.v_interrupt_free_min = 2;
2feb50bfSAttilio Rao	if (cnt.v_page_count < 2000)
f35329acSJohn Dyson		vm_pageout_page_count = 8;
f6b04d2bSDavid Greenman
45ae1d91SAlan Cox	/*
45ae1d91SAlan Cox	 * v_free_reserved needs to include enough for the largest
45ae1d91SAlan Cox	 * swap pager structures plus enough for any pv_entry structs
45ae1d91SAlan Cox	 * when paging.
45ae1d91SAlan Cox	 */
2feb50bfSAttilio Rao	if (cnt.v_page_count > 1024)
2feb50bfSAttilio Rao		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
2feb50bfSAttilio Rao	else
2feb50bfSAttilio Rao		cnt.v_free_min = 4;
2feb50bfSAttilio Rao	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
2feb50bfSAttilio Rao	    cnt.v_interrupt_free_min;
2feb50bfSAttilio Rao	cnt.v_free_reserved = vm_pageout_page_count +
2446e4f0SAlan Cox	    cnt.v_pageout_free_min + (cnt.v_page_count / 768);
2feb50bfSAttilio Rao	cnt.v_free_severe = cnt.v_free_min / 2;
2feb50bfSAttilio Rao	cnt.v_free_min += cnt.v_free_reserved;
2feb50bfSAttilio Rao	cnt.v_free_severe += cnt.v_free_reserved;
45ae1d91SAlan Cox
ed74321bSDavid Greenman	/*
2b6b0df7SMatthew Dillon	 * v_free_target and v_cache_min control pageout hysteresis.  Note
2b6b0df7SMatthew Dillon	 * that these are more a measure of the VM cache queue hysteresis
2b6b0df7SMatthew Dillon	 * then the VM free queue.  Specifically, v_free_target is the
2b6b0df7SMatthew Dillon	 * high water mark (free+cache pages).
2b6b0df7SMatthew Dillon	 *
2b6b0df7SMatthew Dillon	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
2b6b0df7SMatthew Dillon	 * low water mark, while v_free_min is the stop.  v_cache_min must
2b6b0df7SMatthew Dillon	 * be big enough to handle memory needs while the pageout daemon
2b6b0df7SMatthew Dillon	 * is signalled and run to free more pages.
ed74321bSDavid Greenman	 */
2feb50bfSAttilio Rao	if (cnt.v_free_count > 6144)
2feb50bfSAttilio Rao		cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
2feb50bfSAttilio Rao	else
2feb50bfSAttilio Rao		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
6f2b142eSDavid Greenman
2feb50bfSAttilio Rao	if (cnt.v_free_count > 2048) {
2feb50bfSAttilio Rao		cnt.v_cache_min = cnt.v_free_target;
2feb50bfSAttilio Rao		cnt.v_cache_max = 2 * cnt.v_cache_min;
2feb50bfSAttilio Rao		cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
0d94caffSDavid Greenman	} else {
2feb50bfSAttilio Rao		cnt.v_cache_min = 0;
2feb50bfSAttilio Rao		cnt.v_cache_max = 0;
2feb50bfSAttilio Rao		cnt.v_inactive_target = cnt.v_free_count / 4;
0d94caffSDavid Greenman	}
2feb50bfSAttilio Rao	if (cnt.v_inactive_target > cnt.v_free_count / 3)
2feb50bfSAttilio Rao		cnt.v_inactive_target = cnt.v_free_count / 3;
df8bae1dSRodney W. Grimes
df8bae1dSRodney W. Grimes	/* XXX does not really belong here */
df8bae1dSRodney W. Grimes	if (vm_page_max_wired == 0)
2feb50bfSAttilio Rao		vm_page_max_wired = cnt.v_free_count / 3;
df8bae1dSRodney W. Grimes
dc2efb27SJohn Dyson	if (vm_pageout_stats_max == 0)
2feb50bfSAttilio Rao		vm_pageout_stats_max = cnt.v_free_target;
dc2efb27SJohn Dyson
dc2efb27SJohn Dyson	/*
dc2efb27SJohn Dyson	 * Set interval in seconds for stats scan.
dc2efb27SJohn Dyson	 */
dc2efb27SJohn Dyson	if (vm_pageout_stats_interval == 0)
bef608bdSJohn Dyson		vm_pageout_stats_interval = 5;
dc2efb27SJohn Dyson	if (vm_pageout_full_stats_interval == 0)
dc2efb27SJohn Dyson		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
dc2efb27SJohn Dyson
24a1cce3SDavid Greenman	swap_pager_swap_init();
2b6b0df7SMatthew Dillon	pass = 0;
df8bae1dSRodney W. Grimes	/*
0d94caffSDavid Greenman	 * The pageout daemon is never done, so loop forever.
df8bae1dSRodney W. Grimes	 */
df8bae1dSRodney W. Grimes	while (TRUE) {
936524aaSMatthew Dillon		/*
936524aaSMatthew Dillon		 * If we have enough free memory, wakeup waiters.  Do
936524aaSMatthew Dillon		 * not clear vm_pages_needed until we reach our target,
936524aaSMatthew Dillon		 * otherwise we may be woken up over and over again and
936524aaSMatthew Dillon		 * waste a lot of cpu.
936524aaSMatthew Dillon		 */
e9f995d8SAlan Cox		mtx_lock(&vm_page_queue_free_mtx);
936524aaSMatthew Dillon		if (vm_pages_needed && !vm_page_count_min()) {
a1c0a785SAlan Cox			if (!vm_paging_needed())
936524aaSMatthew Dillon				vm_pages_needed = 0;
2feb50bfSAttilio Rao			wakeup(&cnt.v_free_count);
936524aaSMatthew Dillon		}
936524aaSMatthew Dillon		if (vm_pages_needed) {
90ecac61SMatthew Dillon			/*
2b6b0df7SMatthew Dillon			 * Still not done, take a second pass without waiting
2b6b0df7SMatthew Dillon			 * (unlimited dirty cleaning), otherwise sleep a bit
2b6b0df7SMatthew Dillon			 * and try again.
90ecac61SMatthew Dillon			 */
2b6b0df7SMatthew Dillon			++pass;
2b6b0df7SMatthew Dillon			if (pass > 1)
e9f995d8SAlan Cox				msleep(&vm_pages_needed,
e9f995d8SAlan Cox				    &vm_page_queue_free_mtx, PVM, "psleep",
e9f995d8SAlan Cox				    hz / 2);
90ecac61SMatthew Dillon		} else {
90ecac61SMatthew Dillon			/*
2b6b0df7SMatthew Dillon			 * Good enough, sleep & handle stats.  Prime the pass
2b6b0df7SMatthew Dillon			 * for the next run.
90ecac61SMatthew Dillon			 */
2b6b0df7SMatthew Dillon			if (pass > 1)
2b6b0df7SMatthew Dillon				pass = 1;
2b6b0df7SMatthew Dillon			else
2b6b0df7SMatthew Dillon				pass = 0;
e9f995d8SAlan Cox			error = msleep(&vm_pages_needed,
e9f995d8SAlan Cox			    &vm_page_queue_free_mtx, PVM, "psleep",
e9f995d8SAlan Cox			    vm_pageout_stats_interval * hz);
dc2efb27SJohn Dyson			if (error && !vm_pages_needed) {
e9f995d8SAlan Cox				mtx_unlock(&vm_page_queue_free_mtx);
2b6b0df7SMatthew Dillon				pass = 0;
dc2efb27SJohn Dyson				vm_pageout_page_stats();
dc2efb27SJohn Dyson				continue;
dc2efb27SJohn Dyson			}
f919ebdeSDavid Greenman		}
b18bfc3dSJohn Dyson		if (vm_pages_needed)
393a081dSAttilio Rao			cnt.v_pdwakeups++;
e9f995d8SAlan Cox		mtx_unlock(&vm_page_queue_free_mtx);
2b6b0df7SMatthew Dillon		vm_pageout_scan(pass);
df8bae1dSRodney W. Grimes	}
df8bae1dSRodney W. Grimes}
26f9a767SRodney W. Grimes
6b4b77adSAlan Cox/*
e9f995d8SAlan Cox * Unless the free page queue lock is held by the caller, this function
6b4b77adSAlan Cox * should be regarded as advisory.  Specifically, the caller should
6b4b77adSAlan Cox * not msleep() on &cnt.v_free_count following this function unless
e9f995d8SAlan Cox * the free page queue lock is held until the msleep() is performed.
6b4b77adSAlan Cox */
e0c5a895SJohn Dysonvoid
e0c5a895SJohn Dysonpagedaemon_wakeup()
e0c5a895SJohn Dyson{
a1c0a785SAlan Cox
b40ce416SJulian Elischer	if (!vm_pages_needed && curthread->td_proc != pageproc) {
a1c0a785SAlan Cox		vm_pages_needed = 1;
e0c5a895SJohn Dyson		wakeup(&vm_pages_needed);
e0c5a895SJohn Dyson	}
e0c5a895SJohn Dyson}
e0c5a895SJohn Dyson
38efa82bSJohn Dyson#if !defined(NO_SWAPPING)
5afce282SDavid Greenmanstatic void
97824da3SAlan Coxvm_req_vmdaemon(int req)
5afce282SDavid Greenman{
5afce282SDavid Greenman	static int lastrun = 0;
5afce282SDavid Greenman
97824da3SAlan Cox	mtx_lock(&vm_daemon_mtx);
97824da3SAlan Cox	vm_pageout_req_swapout |= req;
b18bfc3dSJohn Dyson	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
5afce282SDavid Greenman		wakeup(&vm_daemon_needed);
5afce282SDavid Greenman		lastrun = ticks;
5afce282SDavid Greenman	}
97824da3SAlan Cox	mtx_unlock(&vm_daemon_mtx);
5afce282SDavid Greenman}
5afce282SDavid Greenman
2b14f991SJulian Elischerstatic void
4f9fb771SBruce Evansvm_daemon()
0d94caffSDavid Greenman{
91d5354aSJohn Baldwin	struct rlimit rsslim;
dcbcd518SBruce Evans	struct proc *p;
dcbcd518SBruce Evans	struct thread *td;
6bed074cSKonstantin Belousov	struct vmspace *vm;
099e7e95SEdward Tomasz Napierala	int breakout, swapout_flags, tryagain, attempts;
afcc55f3SEdward Tomasz Napierala#ifdef RACCT
099e7e95SEdward Tomasz Napierala	uint64_t rsize, ravailable;
afcc55f3SEdward Tomasz Napierala#endif
0d94caffSDavid Greenman
2fe6e4d7SDavid Greenman	while (TRUE) {
97824da3SAlan Cox		mtx_lock(&vm_daemon_mtx);
099e7e95SEdward Tomasz Napierala#ifdef RACCT
099e7e95SEdward Tomasz Napierala		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", hz);
099e7e95SEdward Tomasz Napierala#else
97824da3SAlan Cox		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0);
099e7e95SEdward Tomasz Napierala#endif
97824da3SAlan Cox		swapout_flags = vm_pageout_req_swapout;
4c1f8ee9SDavid Greenman		vm_pageout_req_swapout = 0;
97824da3SAlan Cox		mtx_unlock(&vm_daemon_mtx);
97824da3SAlan Cox		if (swapout_flags)
97824da3SAlan Cox			swapout_procs(swapout_flags);
97824da3SAlan Cox
2fe6e4d7SDavid Greenman		/*
0d94caffSDavid Greenman		 * scan the processes for exceeding their rlimits or if
0d94caffSDavid Greenman		 * process is swapped out -- deactivate pages
2fe6e4d7SDavid Greenman		 */
099e7e95SEdward Tomasz Napierala		tryagain = 0;
099e7e95SEdward Tomasz Napierala		attempts = 0;
099e7e95SEdward Tomasz Napieralaagain:
099e7e95SEdward Tomasz Napierala		attempts++;
1005a129SJohn Baldwin		sx_slock(&allproc_lock);
f67af5c9SXin LI		FOREACH_PROC_IN_SYSTEM(p) {
fe2144fdSLuoqi Chen			vm_pindex_t limit, size;
2fe6e4d7SDavid Greenman
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * if this is a system process or if we have already
2fe6e4d7SDavid Greenman			 * looked at this process, skip it.
2fe6e4d7SDavid Greenman			 */
897ecacdSJohn Baldwin			PROC_LOCK(p);
8e6fa660SJohn Baldwin			if (p->p_state != PRS_NORMAL ||
8e6fa660SJohn Baldwin			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
897ecacdSJohn Baldwin				PROC_UNLOCK(p);
2fe6e4d7SDavid Greenman				continue;
2fe6e4d7SDavid Greenman			}
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * if the process is in a non-running type state,
2fe6e4d7SDavid Greenman			 * don't touch it.
2fe6e4d7SDavid Greenman			 */
e602ba25SJulian Elischer			breakout = 0;
e602ba25SJulian Elischer			FOREACH_THREAD_IN_PROC(p, td) {
982d11f8SJeff Roberson				thread_lock(td);
71fad9fdSJulian Elischer				if (!TD_ON_RUNQ(td) &&
71fad9fdSJulian Elischer				    !TD_IS_RUNNING(td) &&
f497cda2SEdward Tomasz Napierala				    !TD_IS_SLEEPING(td) &&
f497cda2SEdward Tomasz Napierala				    !TD_IS_SUSPENDED(td)) {
982d11f8SJeff Roberson					thread_unlock(td);
e602ba25SJulian Elischer					breakout = 1;
e602ba25SJulian Elischer					break;
e602ba25SJulian Elischer				}
982d11f8SJeff Roberson				thread_unlock(td);
e602ba25SJulian Elischer			}
897ecacdSJohn Baldwin			if (breakout) {
897ecacdSJohn Baldwin				PROC_UNLOCK(p);
2fe6e4d7SDavid Greenman				continue;
2fe6e4d7SDavid Greenman			}
2fe6e4d7SDavid Greenman			/*
2fe6e4d7SDavid Greenman			 * get a limit
2fe6e4d7SDavid Greenman			 */
dcbcd518SBruce Evans			lim_rlimit(p, RLIMIT_RSS, &rsslim);
fe2144fdSLuoqi Chen			limit = OFF_TO_IDX(
91d5354aSJohn Baldwin			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
2fe6e4d7SDavid Greenman
2fe6e4d7SDavid Greenman			/*
0d94caffSDavid Greenman			 * let processes that are swapped out really be
0d94caffSDavid Greenman			 * swapped out set the limit to nothing (will force a
0d94caffSDavid Greenman			 * swap-out.)
2fe6e4d7SDavid Greenman			 */
b61ce5b0SJeff Roberson			if ((p->p_flag & P_INMEM) == 0)
0d94caffSDavid Greenman				limit = 0;	/* XXX */
6bed074cSKonstantin Belousov			vm = vmspace_acquire_ref(p);
897ecacdSJohn Baldwin			PROC_UNLOCK(p);
6bed074cSKonstantin Belousov			if (vm == NULL)
6bed074cSKonstantin Belousov				continue;
2fe6e4d7SDavid Greenman
6bed074cSKonstantin Belousov			size = vmspace_resident_count(vm);
2fe6e4d7SDavid Greenman			if (limit >= 0 && size >= limit) {
fe2144fdSLuoqi Chen				vm_pageout_map_deactivate_pages(
6bed074cSKonstantin Belousov				    &vm->vm_map, limit);
2fe6e4d7SDavid Greenman			}
afcc55f3SEdward Tomasz Napierala#ifdef RACCT
099e7e95SEdward Tomasz Napierala			rsize = IDX_TO_OFF(size);
099e7e95SEdward Tomasz Napierala			PROC_LOCK(p);
099e7e95SEdward Tomasz Napierala			racct_set(p, RACCT_RSS, rsize);
099e7e95SEdward Tomasz Napierala			ravailable = racct_get_available(p, RACCT_RSS);
099e7e95SEdward Tomasz Napierala			PROC_UNLOCK(p);
099e7e95SEdward Tomasz Napierala			if (rsize > ravailable) {
099e7e95SEdward Tomasz Napierala				/*
099e7e95SEdward Tomasz Napierala				 * Don't be overly aggressive; this might be
099e7e95SEdward Tomasz Napierala				 * an innocent process, and the limit could've
099e7e95SEdward Tomasz Napierala				 * been exceeded by some memory hog.  Don't
099e7e95SEdward Tomasz Napierala				 * try to deactivate more than 1/4th of process'
099e7e95SEdward Tomasz Napierala				 * resident set size.
099e7e95SEdward Tomasz Napierala				 */
099e7e95SEdward Tomasz Napierala				if (attempts <= 8) {
099e7e95SEdward Tomasz Napierala					if (ravailable < rsize - (rsize / 4))
099e7e95SEdward Tomasz Napierala						ravailable = rsize - (rsize / 4);
099e7e95SEdward Tomasz Napierala				}
099e7e95SEdward Tomasz Napierala				vm_pageout_map_deactivate_pages(
099e7e95SEdward Tomasz Napierala				    &vm->vm_map, OFF_TO_IDX(ravailable));
099e7e95SEdward Tomasz Napierala				/* Update RSS usage after paging out. */
099e7e95SEdward Tomasz Napierala				size = vmspace_resident_count(vm);
099e7e95SEdward Tomasz Napierala				rsize = IDX_TO_OFF(size);
099e7e95SEdward Tomasz Napierala				PROC_LOCK(p);
099e7e95SEdward Tomasz Napierala				racct_set(p, RACCT_RSS, rsize);
099e7e95SEdward Tomasz Napierala				PROC_UNLOCK(p);
099e7e95SEdward Tomasz Napierala				if (rsize > ravailable)
099e7e95SEdward Tomasz Napierala					tryagain = 1;
099e7e95SEdward Tomasz Napierala			}
afcc55f3SEdward Tomasz Napierala#endif
6bed074cSKonstantin Belousov			vmspace_free(vm);
2fe6e4d7SDavid Greenman		}
1005a129SJohn Baldwin		sx_sunlock(&allproc_lock);
099e7e95SEdward Tomasz Napierala		if (tryagain != 0 && attempts <= 10)
099e7e95SEdward Tomasz Napierala			goto again;
24a1cce3SDavid Greenman	}
2fe6e4d7SDavid Greenman}
a1287949SEivind Eklund#endif			/* !defined(NO_SWAPPING) */