xref: /freebsd/sys/vm/vm_pageout.c (revision cdebaff820b2a4915a16cedfd511823d78aab171)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2005 Yahoo! Technologies Norway AS
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * The Mach Operating System project at Carnegie-Mellon University.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
43  *
44  *
45  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
46  * All rights reserved.
47  *
48  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
49  *
50  * Permission to use, copy, modify and distribute this software and
51  * its documentation is hereby granted, provided that both the copyright
52  * notice and this permission notice appear in all copies of the
53  * software, derivative works or modified versions, and any portions
54  * thereof, and that both notices appear in supporting documentation.
55  *
56  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
57  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
58  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
59  *
60  * Carnegie Mellon requests users of this software to return to
61  *
62  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
63  *  School of Computer Science
64  *  Carnegie Mellon University
65  *  Pittsburgh PA 15213-3890
66  *
67  * any improvements or extensions that they make and grant Carnegie the
68  * rights to redistribute these changes.
69  */
70 
71 /*
72  *	The proverbial page-out daemon.
73  */
74 
75 #include <sys/cdefs.h>
76 __FBSDID("$FreeBSD$");
77 
78 #include "opt_vm.h"
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/kernel.h>
83 #include <sys/eventhandler.h>
84 #include <sys/lock.h>
85 #include <sys/mutex.h>
86 #include <sys/proc.h>
87 #include <sys/kthread.h>
88 #include <sys/ktr.h>
89 #include <sys/mount.h>
90 #include <sys/racct.h>
91 #include <sys/resourcevar.h>
92 #include <sys/sched.h>
93 #include <sys/sdt.h>
94 #include <sys/signalvar.h>
95 #include <sys/smp.h>
96 #include <sys/time.h>
97 #include <sys/vnode.h>
98 #include <sys/vmmeter.h>
99 #include <sys/rwlock.h>
100 #include <sys/sx.h>
101 #include <sys/sysctl.h>
102 
103 #include <vm/vm.h>
104 #include <vm/vm_param.h>
105 #include <vm/vm_object.h>
106 #include <vm/vm_page.h>
107 #include <vm/vm_map.h>
108 #include <vm/vm_pageout.h>
109 #include <vm/vm_pager.h>
110 #include <vm/vm_phys.h>
111 #include <vm/swap_pager.h>
112 #include <vm/vm_extern.h>
113 #include <vm/uma.h>
114 
115 /*
116  * System initialization
117  */
118 
119 /* the kernel process "vm_pageout"*/
120 static void vm_pageout(void);
121 static void vm_pageout_init(void);
122 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
123 static int vm_pageout_cluster(vm_page_t m);
124 static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
125 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
126     int starting_page_shortage);
127 
128 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
129     NULL);
130 
131 struct proc *pageproc;
132 
133 static struct kproc_desc page_kp = {
134 	"pagedaemon",
135 	vm_pageout,
136 	&pageproc
137 };
138 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
139     &page_kp);
140 
141 SDT_PROVIDER_DEFINE(vm);
142 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
143 
144 #if !defined(NO_SWAPPING)
145 /* the kernel process "vm_daemon"*/
146 static void vm_daemon(void);
147 static struct	proc *vmproc;
148 
149 static struct kproc_desc vm_kp = {
150 	"vmdaemon",
151 	vm_daemon,
152 	&vmproc
153 };
154 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
155 #endif
156 
157 /* Pagedaemon activity rates, in subdivisions of one second. */
158 #define	VM_LAUNDER_RATE		10
159 #define	VM_INACT_SCAN_RATE	2
160 
161 int vm_pageout_deficit;		/* Estimated number of pages deficit */
162 u_int vm_pageout_wakeup_thresh;
163 static int vm_pageout_oom_seq = 12;
164 bool vm_pageout_wanted;		/* Event on which pageout daemon sleeps */
165 bool vm_pages_needed;		/* Are threads waiting for free pages? */
166 
167 /* Pending request for dirty page laundering. */
168 static enum {
169 	VM_LAUNDRY_IDLE,
170 	VM_LAUNDRY_BACKGROUND,
171 	VM_LAUNDRY_SHORTFALL
172 } vm_laundry_request = VM_LAUNDRY_IDLE;
173 
174 #if !defined(NO_SWAPPING)
175 static int vm_pageout_req_swapout;	/* XXX */
176 static int vm_daemon_needed;
177 static struct mtx vm_daemon_mtx;
178 /* Allow for use by vm_pageout before vm_daemon is initialized. */
179 MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
180 #endif
181 static int vm_pageout_update_period;
182 static int disable_swap_pageouts;
183 static int lowmem_period = 10;
184 static time_t lowmem_uptime;
185 static int swapdev_enabled;
186 
187 #if defined(NO_SWAPPING)
188 static int vm_swap_enabled = 0;
189 static int vm_swap_idle_enabled = 0;
190 #else
191 static int vm_swap_enabled = 1;
192 static int vm_swap_idle_enabled = 0;
193 #endif
194 
195 static int vm_panic_on_oom = 0;
196 
197 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
198 	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
199 	"panic on out of memory instead of killing the largest process");
200 
201 SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
202 	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
203 	"free page threshold for waking up the pageout daemon");
204 
205 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
206 	CTLFLAG_RW, &vm_pageout_update_period, 0,
207 	"Maximum active LRU update period");
208 
209 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
210 	"Low memory callback period");
211 
212 #if defined(NO_SWAPPING)
213 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
214 	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
215 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
216 	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
217 #else
218 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
219 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
220 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
221 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
222 #endif
223 
224 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
225 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
226 
227 static int pageout_lock_miss;
228 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
229 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
230 
231 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
232 	CTLFLAG_RW, &vm_pageout_oom_seq, 0,
233 	"back-to-back calls to oom detector to start OOM");
234 
235 static int act_scan_laundry_weight = 3;
236 SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW,
237     &act_scan_laundry_weight, 0,
238     "weight given to clean vs. dirty pages in active queue scans");
239 
240 static u_int vm_background_launder_target;
241 SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW,
242     &vm_background_launder_target, 0,
243     "background laundering target, in pages");
244 
245 static u_int vm_background_launder_rate = 4096;
246 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW,
247     &vm_background_launder_rate, 0,
248     "background laundering rate, in kilobytes per second");
249 
250 static u_int vm_background_launder_max = 20 * 1024;
251 SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW,
252     &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
253 
254 #define VM_PAGEOUT_PAGE_COUNT 16
255 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
256 
257 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
258 SYSCTL_INT(_vm, OID_AUTO, max_wired,
259 	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
260 
261 static u_int isqrt(u_int num);
262 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
263 static int vm_pageout_launder(struct vm_domain *vmd, int launder,
264     bool in_shortfall);
265 static void vm_pageout_laundry_worker(void *arg);
266 #if !defined(NO_SWAPPING)
267 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
268 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
269 static void vm_req_vmdaemon(int req);
270 #endif
271 static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
272 
273 /*
274  * Initialize a dummy page for marking the caller's place in the specified
275  * paging queue.  In principle, this function only needs to set the flag
276  * PG_MARKER.  Nonetheless, it write busies the marker and initializes its
277  * hold count to one as safety precautions.
278  */
279 static void
280 vm_pageout_init_marker(vm_page_t marker, u_short queue)
281 {
282 
283 	bzero(marker, sizeof(*marker));
284 	marker->flags = PG_MARKER;
285 	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
286 	marker->queue = queue;
287 	marker->hold_count = 1;
288 }
289 
290 /*
291  * vm_pageout_fallback_object_lock:
292  *
293  * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
294  * known to have failed and page queue must be either PQ_ACTIVE or
295  * PQ_INACTIVE.  To avoid lock order violation, unlock the page queue
296  * while locking the vm object.  Use marker page to detect page queue
297  * changes and maintain notion of next page on page queue.  Return
298  * TRUE if no changes were detected, FALSE otherwise.  vm object is
299  * locked on return.
300  *
301  * This function depends on both the lock portion of struct vm_object
302  * and normal struct vm_page being type stable.
303  */
304 static boolean_t
305 vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
306 {
307 	struct vm_page marker;
308 	struct vm_pagequeue *pq;
309 	boolean_t unchanged;
310 	u_short queue;
311 	vm_object_t object;
312 
313 	queue = m->queue;
314 	vm_pageout_init_marker(&marker, queue);
315 	pq = vm_page_pagequeue(m);
316 	object = m->object;
317 
318 	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
319 	vm_pagequeue_unlock(pq);
320 	vm_page_unlock(m);
321 	VM_OBJECT_WLOCK(object);
322 	vm_page_lock(m);
323 	vm_pagequeue_lock(pq);
324 
325 	/*
326 	 * The page's object might have changed, and/or the page might
327 	 * have moved from its original position in the queue.  If the
328 	 * page's object has changed, then the caller should abandon
329 	 * processing the page because the wrong object lock was
330 	 * acquired.  Use the marker's plinks.q, not the page's, to
331 	 * determine if the page has been moved.  The state of the
332 	 * page's plinks.q can be indeterminate; whereas, the marker's
333 	 * plinks.q must be valid.
334 	 */
335 	*next = TAILQ_NEXT(&marker, plinks.q);
336 	unchanged = m->object == object &&
337 	    m == TAILQ_PREV(&marker, pglist, plinks.q);
338 	KASSERT(!unchanged || m->queue == queue,
339 	    ("page %p queue %d %d", m, queue, m->queue));
340 	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
341 	return (unchanged);
342 }
343 
344 /*
345  * Lock the page while holding the page queue lock.  Use marker page
346  * to detect page queue changes and maintain notion of next page on
347  * page queue.  Return TRUE if no changes were detected, FALSE
348  * otherwise.  The page is locked on return. The page queue lock might
349  * be dropped and reacquired.
350  *
351  * This function depends on normal struct vm_page being type stable.
352  */
353 static boolean_t
354 vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
355 {
356 	struct vm_page marker;
357 	struct vm_pagequeue *pq;
358 	boolean_t unchanged;
359 	u_short queue;
360 
361 	vm_page_lock_assert(m, MA_NOTOWNED);
362 	if (vm_page_trylock(m))
363 		return (TRUE);
364 
365 	queue = m->queue;
366 	vm_pageout_init_marker(&marker, queue);
367 	pq = vm_page_pagequeue(m);
368 
369 	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
370 	vm_pagequeue_unlock(pq);
371 	vm_page_lock(m);
372 	vm_pagequeue_lock(pq);
373 
374 	/* Page queue might have changed. */
375 	*next = TAILQ_NEXT(&marker, plinks.q);
376 	unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
377 	KASSERT(!unchanged || m->queue == queue,
378 	    ("page %p queue %d %d", m, queue, m->queue));
379 	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
380 	return (unchanged);
381 }
382 
383 /*
384  * Scan for pages at adjacent offsets within the given page's object that are
385  * eligible for laundering, form a cluster of these pages and the given page,
386  * and launder that cluster.
387  */
388 static int
389 vm_pageout_cluster(vm_page_t m)
390 {
391 	vm_object_t object;
392 	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
393 	vm_pindex_t pindex;
394 	int ib, is, page_base, pageout_count;
395 
396 	vm_page_assert_locked(m);
397 	object = m->object;
398 	VM_OBJECT_ASSERT_WLOCKED(object);
399 	pindex = m->pindex;
400 
401 	/*
402 	 * We can't clean the page if it is busy or held.
403 	 */
404 	vm_page_assert_unbusied(m);
405 	KASSERT(m->hold_count == 0, ("page %p is held", m));
406 	vm_page_unlock(m);
407 
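	/*
	 * Place the starting page at the midpoint of the cluster array so
	 * that the cluster can grow toward either lower or higher pindexes
	 * without shifting the entries already gathered.
	 */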
408 	mc[vm_pageout_page_count] = pb = ps = m;
409 	pageout_count = 1;
410 	page_base = vm_pageout_page_count;
411 	ib = 1;
412 	is = 1;
413 
414 	/*
415 	 * We can cluster only if the page is not clean, busy, or held, and
416 	 * the page is in the laundry queue.
417 	 *
418 	 * During heavy mmap/modification loads the pageout
419 	 * daemon can really fragment the underlying file
420 	 * due to flushing pages out of order and not trying to
421 	 * align the clusters (which leaves sporadic out-of-order
422 	 * holes).  To solve this problem we do the reverse scan
423 	 * first and attempt to align our cluster, then do a
424 	 * forward scan if room remains.
425 	 */
426 more:
427 	while (ib != 0 && pageout_count < vm_pageout_page_count) {
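		/*
		 * Stop the reverse scan if it would run past the start of
		 * the object, i.e., if pindex - ib would underflow.
		 */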
428 		if (ib > pindex) {
429 			ib = 0;
430 			break;
431 		}
432 		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
433 			ib = 0;
434 			break;
435 		}
436 		vm_page_test_dirty(p);
437 		if (p->dirty == 0) {
438 			ib = 0;
439 			break;
440 		}
441 		vm_page_lock(p);
442 		if (!vm_page_in_laundry(p) ||
443 		    p->hold_count != 0) {	/* may be undergoing I/O */
444 			vm_page_unlock(p);
445 			ib = 0;
446 			break;
447 		}
448 		vm_page_unlock(p);
449 		mc[--page_base] = pb = p;
450 		++pageout_count;
451 		++ib;
452 
453 		/*
454 		 * We are at an alignment boundary.  Stop here, and switch
455 		 * directions.  Do not clear ib.
456 		 */
457 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
458 			break;
459 	}
460 	while (pageout_count < vm_pageout_page_count &&
461 	    pindex + is < object->size) {
462 		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
463 			break;
464 		vm_page_test_dirty(p);
465 		if (p->dirty == 0)
466 			break;
467 		vm_page_lock(p);
468 		if (!vm_page_in_laundry(p) ||
469 		    p->hold_count != 0) {	/* may be undergoing I/O */
470 			vm_page_unlock(p);
471 			break;
472 		}
473 		vm_page_unlock(p);
474 		mc[page_base + pageout_count] = ps = p;
475 		++pageout_count;
476 		++is;
477 	}
478 
479 	/*
480 	 * If we exhausted our forward scan, continue with the reverse scan
481 	 * when possible, even past an alignment boundary.  This catches
482 	 * boundary conditions.
483 	 */
484 	if (ib != 0 && pageout_count < vm_pageout_page_count)
485 		goto more;
486 
487 	return (vm_pageout_flush(&mc[page_base], pageout_count,
488 	    VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
489 }
490 
491 /*
492  * vm_pageout_flush() - launder the given pages
493  *
494  *	The given pages are laundered.  Note that we setup for the start of
495  *	The given pages are laundered.  Note that we set up for the start of
496  *	I/O (i.e., busy the page), mark it read-only, and bump the object
497  *	reference count all in here rather than in the parent.  If we want
498  *	the ordering.
499  *
500  *	The returned runlen is the count of pages between mreq and the first
501  *	page after mreq with status VM_PAGER_AGAIN.
502  *	*eio is set to TRUE if the pager returned VM_PAGER_ERROR or
503  *	VM_PAGER_FAIL for any page in that run.
504  */
505 int
506 vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
507     boolean_t *eio)
508 {
509 	vm_object_t object = mc[0]->object;
510 	int pageout_status[count];
511 	int numpagedout = 0;
512 	int i, runlen;
513 
514 	VM_OBJECT_ASSERT_WLOCKED(object);
515 
516 	/*
517 	 * Initiate I/O.  Bump the vm_page_t->busy counter and
518 	 * mark the pages read-only.
519 	 *
520 	 * We do not have to fixup the clean/dirty bits here... we can
521 	 * allow the pager to do it after the I/O completes.
522 	 *
523 	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
524 	 * edge case with file fragments.
525 	 */
526 	for (i = 0; i < count; i++) {
527 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
528 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
529 			mc[i], i, count));
530 		vm_page_sbusy(mc[i]);
531 		pmap_remove_write(mc[i]);
532 	}
533 	vm_object_pip_add(object, count);
534 
535 	vm_pager_put_pages(object, mc, count, flags, pageout_status);
536 
537 	runlen = count - mreq;
538 	if (eio != NULL)
539 		*eio = FALSE;
540 	for (i = 0; i < count; i++) {
541 		vm_page_t mt = mc[i];
542 
543 		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
544 		    !pmap_page_is_write_mapped(mt),
545 		    ("vm_pageout_flush: page %p is not write protected", mt));
546 		switch (pageout_status[i]) {
547 		case VM_PAGER_OK:
548 			vm_page_lock(mt);
549 			if (vm_page_in_laundry(mt))
550 				vm_page_deactivate_noreuse(mt);
551 			vm_page_unlock(mt);
552 			/* FALLTHROUGH */
553 		case VM_PAGER_PEND:
554 			numpagedout++;
555 			break;
556 		case VM_PAGER_BAD:
557 			/*
558 			 * The page is outside the object's range.  We pretend
559 			 * that the page out worked and clean the page, so the
560 			 * changes will be lost if the page is reclaimed by
561 			 * the page daemon.
562 			 */
563 			vm_page_undirty(mt);
564 			vm_page_lock(mt);
565 			if (vm_page_in_laundry(mt))
566 				vm_page_deactivate_noreuse(mt);
567 			vm_page_unlock(mt);
568 			break;
569 		case VM_PAGER_ERROR:
570 		case VM_PAGER_FAIL:
571 			/*
572 			 * If the page couldn't be paged out to swap because the
573 			 * pager wasn't able to find space, place the page in
574 			 * the PQ_UNSWAPPABLE holding queue.  This is an
575 			 * optimization that prevents the page daemon from
576 			 * wasting CPU cycles on pages that cannot be reclaimed
577 			 * because no swap device is configured.
578 			 *
579 			 * Otherwise, reactivate the page so that it doesn't
580 			 * clog the laundry and inactive queues.  (We will try
581 			 * paging it out again later.)
582 			 */
583 			vm_page_lock(mt);
584 			if (object->type == OBJT_SWAP &&
585 			    pageout_status[i] == VM_PAGER_FAIL) {
586 				vm_page_unswappable(mt);
587 				numpagedout++;
588 			} else
589 				vm_page_activate(mt);
590 			vm_page_unlock(mt);
591 			if (eio != NULL && i >= mreq && i - mreq < runlen)
592 				*eio = TRUE;
593 			break;
594 		case VM_PAGER_AGAIN:
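			/*
			 * The pager could not schedule this page, so truncate
			 * the run length reported back to the caller at the
			 * first such page after mreq.
			 */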
595 			if (i >= mreq && i - mreq < runlen)
596 				runlen = i - mreq;
597 			break;
598 		}
599 
600 		/*
601 		 * If the operation is still going, leave the page busy to
602 		 * block all other accesses. Also, leave the paging in
603 		 * progress indicator set so that we don't attempt an object
604 		 * collapse.
605 		 */
606 		if (pageout_status[i] != VM_PAGER_PEND) {
607 			vm_object_pip_wakeup(object);
608 			vm_page_sunbusy(mt);
609 		}
610 	}
611 	if (prunlen != NULL)
612 		*prunlen = runlen;
613 	return (numpagedout);
614 }
615 
616 static void
617 vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
618 {
619 
620 	atomic_store_rel_int(&swapdev_enabled, 1);
621 }
622 
623 static void
624 vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
625 {
626 
627 	if (swap_pager_nswapdev() == 1)
628 		atomic_store_rel_int(&swapdev_enabled, 0);
629 }
630 
631 #if !defined(NO_SWAPPING)
632 /*
633  *	vm_pageout_object_deactivate_pages
634  *
635  *	Deactivate enough pages to satisfy the inactive target
636  *	requirements.
637  *
638  *	The object and map must be locked.
639  */
640 static void
641 vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
642     long desired)
643 {
644 	vm_object_t backing_object, object;
645 	vm_page_t p;
646 	int act_delta, remove_mode;
647 
648 	VM_OBJECT_ASSERT_LOCKED(first_object);
649 	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
650 		return;
651 	for (object = first_object;; object = backing_object) {
652 		if (pmap_resident_count(pmap) <= desired)
653 			goto unlock_return;
654 		VM_OBJECT_ASSERT_LOCKED(object);
655 		if ((object->flags & OBJ_UNMANAGED) != 0 ||
656 		    object->paging_in_progress != 0)
657 			goto unlock_return;
658 
659 		remove_mode = 0;
660 		if (object->shadow_count > 1)
661 			remove_mode = 1;
662 		/*
663 		 * Scan the object's entire memory queue.
664 		 */
665 		TAILQ_FOREACH(p, &object->memq, listq) {
666 			if (pmap_resident_count(pmap) <= desired)
667 				goto unlock_return;
668 			if (vm_page_busied(p))
669 				continue;
670 			PCPU_INC(cnt.v_pdpages);
671 			vm_page_lock(p);
672 			if (p->wire_count != 0 || p->hold_count != 0 ||
673 			    !pmap_page_exists_quick(pmap, p)) {
674 				vm_page_unlock(p);
675 				continue;
676 			}
677 			act_delta = pmap_ts_referenced(p);
678 			if ((p->aflags & PGA_REFERENCED) != 0) {
679 				if (act_delta == 0)
680 					act_delta = 1;
681 				vm_page_aflag_clear(p, PGA_REFERENCED);
682 			}
683 			if (!vm_page_active(p) && act_delta != 0) {
684 				vm_page_activate(p);
685 				p->act_count += act_delta;
686 			} else if (vm_page_active(p)) {
687 				if (act_delta == 0) {
688 					p->act_count -= min(p->act_count,
689 					    ACT_DECLINE);
690 					if (!remove_mode && p->act_count == 0) {
691 						pmap_remove_all(p);
692 						vm_page_deactivate(p);
693 					} else
694 						vm_page_requeue(p);
695 				} else {
696 					vm_page_activate(p);
697 					if (p->act_count < ACT_MAX -
698 					    ACT_ADVANCE)
699 						p->act_count += ACT_ADVANCE;
700 					vm_page_requeue(p);
701 				}
702 			} else if (vm_page_inactive(p))
703 				pmap_remove_all(p);
704 			vm_page_unlock(p);
705 		}
706 		if ((backing_object = object->backing_object) == NULL)
707 			goto unlock_return;
708 		VM_OBJECT_RLOCK(backing_object);
709 		if (object != first_object)
710 			VM_OBJECT_RUNLOCK(object);
711 	}
712 unlock_return:
713 	if (object != first_object)
714 		VM_OBJECT_RUNLOCK(object);
715 }
716 
717 /*
718  * Deactivate some number of pages in a map.  Try to do it fairly, but
719  * that is really hard to do.
720  */
721 static void
722 vm_pageout_map_deactivate_pages(map, desired)
723 	vm_map_t map;
724 	long desired;
725 {
726 	vm_map_entry_t tmpe;
727 	vm_object_t obj, bigobj;
728 	int nothingwired;
729 
730 	if (!vm_map_trylock(map))
731 		return;
732 
733 	bigobj = NULL;
734 	nothingwired = TRUE;
735 
736 	/*
737 	 * first, search out the biggest object, and try to free pages from
738 	 * that.
739 	 */
740 	tmpe = map->header.next;
741 	while (tmpe != &map->header) {
742 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
743 			obj = tmpe->object.vm_object;
744 			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
745 				if (obj->shadow_count <= 1 &&
746 				    (bigobj == NULL ||
747 				     bigobj->resident_page_count < obj->resident_page_count)) {
748 					if (bigobj != NULL)
749 						VM_OBJECT_RUNLOCK(bigobj);
750 					bigobj = obj;
751 				} else
752 					VM_OBJECT_RUNLOCK(obj);
753 			}
754 		}
755 		if (tmpe->wired_count > 0)
756 			nothingwired = FALSE;
757 		tmpe = tmpe->next;
758 	}
759 
760 	if (bigobj != NULL) {
761 		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
762 		VM_OBJECT_RUNLOCK(bigobj);
763 	}
764 	/*
765 	 * Next, hunt around for other pages to deactivate.  We actually
766 	 * do this search sort of wrong -- .text first is not the best idea.
767 	 */
768 	tmpe = map->header.next;
769 	while (tmpe != &map->header) {
770 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
771 			break;
772 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
773 			obj = tmpe->object.vm_object;
774 			if (obj != NULL) {
775 				VM_OBJECT_RLOCK(obj);
776 				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
777 				VM_OBJECT_RUNLOCK(obj);
778 			}
779 		}
780 		tmpe = tmpe->next;
781 	}
782 
783 	/*
784 	 * Remove all mappings if a process is swapped out; this will free
785 	 * page table pages.
786 	 */
787 	if (desired == 0 && nothingwired) {
788 		pmap_remove(vm_map_pmap(map), vm_map_min(map),
789 		    vm_map_max(map));
790 	}
791 
792 	vm_map_unlock(map);
793 }
794 #endif		/* !defined(NO_SWAPPING) */
795 
796 /*
797  * Attempt to acquire all of the necessary locks to launder a page and
798  * then call through the clustering layer to PUTPAGES.  Wait a short
799  * time for a vnode lock.
800  *
801  * Requires the page and object lock on entry, releases both before return.
802  * Returns 0 on success and an errno otherwise.
803  */
804 static int
805 vm_pageout_clean(vm_page_t m, int *numpagedout)
806 {
807 	struct vnode *vp;
808 	struct mount *mp;
809 	vm_object_t object;
810 	vm_pindex_t pindex;
811 	int error, lockmode;
812 
813 	vm_page_assert_locked(m);
814 	object = m->object;
815 	VM_OBJECT_ASSERT_WLOCKED(object);
816 	error = 0;
817 	vp = NULL;
818 	mp = NULL;
819 
820 	/*
821 	 * The object is already known NOT to be dead.   It
822 	 * is possible for the vget() to block the whole
823 	 * pageout daemon, but the new low-memory handling
824 	 * code should prevent it.
825 	 *
826 	 * We can't wait forever for the vnode lock; we might
827 	 * deadlock due to a vn_read() getting stuck in
828 	 * vm_wait while holding this vnode.  We skip the
829 	 * vnode if we can't get it in a reasonable amount
830 	 * of time.
831 	 */
832 	if (object->type == OBJT_VNODE) {
833 		vm_page_unlock(m);
834 		vp = object->handle;
835 		if (vp->v_type == VREG &&
836 		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
837 			mp = NULL;
838 			error = EDEADLK;
839 			goto unlock_all;
840 		}
841 		KASSERT(mp != NULL,
842 		    ("vp %p with NULL v_mount", vp));
843 		vm_object_reference_locked(object);
844 		pindex = m->pindex;
845 		VM_OBJECT_WUNLOCK(object);
846 		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
847 		    LK_SHARED : LK_EXCLUSIVE;
848 		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
849 			vp = NULL;
850 			error = EDEADLK;
851 			goto unlock_mp;
852 		}
853 		VM_OBJECT_WLOCK(object);
854 		vm_page_lock(m);
855 		/*
856 		 * While the object and page were unlocked, the page
857 		 * may have been:
858 		 * (1) moved to a different queue,
859 		 * (2) reallocated to a different object,
860 		 * (3) reallocated to a different offset, or
861 		 * (4) cleaned.
862 		 */
863 		if (!vm_page_in_laundry(m) || m->object != object ||
864 		    m->pindex != pindex || m->dirty == 0) {
865 			vm_page_unlock(m);
866 			error = ENXIO;
867 			goto unlock_all;
868 		}
869 
870 		/*
871 		 * The page may have been busied or held while the object
872 		 * and page locks were released.
873 		 */
874 		if (vm_page_busied(m) || m->hold_count != 0) {
875 			vm_page_unlock(m);
876 			error = EBUSY;
877 			goto unlock_all;
878 		}
879 	}
880 
881 	/*
882 	 * If a page is dirty, then it is either being washed
883 	 * (but not yet cleaned) or it is still in the
884 	 * laundry.  If it is still in the laundry, then we
885 	 * start the cleaning operation.
886 	 */
887 	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
888 		error = EIO;
889 
890 unlock_all:
891 	VM_OBJECT_WUNLOCK(object);
892 
893 unlock_mp:
894 	vm_page_lock_assert(m, MA_NOTOWNED);
895 	if (mp != NULL) {
896 		if (vp != NULL)
897 			vput(vp);
898 		vm_object_deallocate(object);
899 		vn_finished_write(mp);
900 	}
901 
902 	return (error);
903 }
904 
905 /*
906  * Attempt to launder the specified number of pages.
907  *
908  * Returns the number of pages successfully laundered.
909  */
910 static int
911 vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
912 {
913 	struct vm_pagequeue *pq;
914 	vm_object_t object;
915 	vm_page_t m, next;
916 	int act_delta, error, maxscan, numpagedout, starting_target;
917 	int vnodes_skipped;
918 	bool pageout_ok, queue_locked;
919 
920 	starting_target = launder;
921 	vnodes_skipped = 0;
922 
923 	/*
924 	 * Scan the laundry queues for pages eligible to be laundered.  We stop
925 	 * once the target number of dirty pages have been laundered, or once
926 	 * we've reached the end of the queue.  A single iteration of this loop
927 	 * may cause more than one page to be laundered because of clustering.
928 	 *
929 	 * maxscan ensures that we don't re-examine requeued pages.  Any
930 	 * additional pages written as part of a cluster are subtracted from
931 	 * maxscan since they must be taken from the laundry queue.
932 	 *
933 	 * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
934 	 * swap devices are configured.
935 	 */
936 	if (atomic_load_acq_int(&swapdev_enabled))
937 		pq = &vmd->vmd_pagequeues[PQ_UNSWAPPABLE];
938 	else
939 		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
940 
941 scan:
942 	vm_pagequeue_lock(pq);
943 	maxscan = pq->pq_cnt;
944 	queue_locked = true;
945 	for (m = TAILQ_FIRST(&pq->pq_pl);
946 	    m != NULL && maxscan-- > 0 && launder > 0;
947 	    m = next) {
948 		vm_pagequeue_assert_locked(pq);
949 		KASSERT(queue_locked, ("unlocked laundry queue"));
950 		KASSERT(vm_page_in_laundry(m),
951 		    ("page %p has an inconsistent queue", m));
952 		next = TAILQ_NEXT(m, plinks.q);
953 		if ((m->flags & PG_MARKER) != 0)
954 			continue;
955 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
956 		    ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
957 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
958 		    ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
959 		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
960 			vm_page_unlock(m);
961 			continue;
962 		}
963 		object = m->object;
964 		if ((!VM_OBJECT_TRYWLOCK(object) &&
965 		    (!vm_pageout_fallback_object_lock(m, &next) ||
966 		    m->hold_count != 0)) || vm_page_busied(m)) {
967 			VM_OBJECT_WUNLOCK(object);
968 			vm_page_unlock(m);
969 			continue;
970 		}
971 
972 		/*
973 		 * Unlock the laundry queue, invalidating the 'next' pointer.
974 		 * Use a marker to remember our place in the laundry queue.
975 		 */
976 		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
977 		    plinks.q);
978 		vm_pagequeue_unlock(pq);
979 		queue_locked = false;
980 
981 		/*
982 		 * Invalid pages can be easily freed.  They cannot be
983 		 * mapped; vm_page_free() asserts this.
984 		 */
985 		if (m->valid == 0)
986 			goto free_page;
987 
988 		/*
989 		 * If the page has been referenced and the object is not dead,
990 		 * reactivate or requeue the page depending on whether the
991 		 * object is mapped.
992 		 */
993 		if ((m->aflags & PGA_REFERENCED) != 0) {
994 			vm_page_aflag_clear(m, PGA_REFERENCED);
995 			act_delta = 1;
996 		} else
997 			act_delta = 0;
998 		if (object->ref_count != 0)
999 			act_delta += pmap_ts_referenced(m);
1000 		else {
1001 			KASSERT(!pmap_page_is_mapped(m),
1002 			    ("page %p is mapped", m));
1003 		}
1004 		if (act_delta != 0) {
1005 			if (object->ref_count != 0) {
1006 				PCPU_INC(cnt.v_reactivated);
1007 				vm_page_activate(m);
1008 
1009 				/*
1010 				 * Increase the activation count if the page
1011 				 * was referenced while in the laundry queue.
1012 				 * This makes it less likely that the page will
1013 				 * be returned prematurely to the inactive
1014 				 * queue.
1015  				 */
1016 				m->act_count += act_delta + ACT_ADVANCE;
1017 
1018 				/*
1019 				 * If this was a background laundering, count
1020 				 * activated pages towards our target.  The
1021 				 * purpose of background laundering is to ensure
1022 				 * that pages are eventually cycled through the
1023 				 * laundry queue, and an activation is a valid
1024 				 * way out.
1025 				 */
1026 				if (!in_shortfall)
1027 					launder--;
1028 				goto drop_page;
1029 			} else if ((object->flags & OBJ_DEAD) == 0)
1030 				goto requeue_page;
1031 		}
1032 
1033 		/*
1034 		 * If the page appears to be clean at the machine-independent
1035 		 * layer, then remove all of its mappings from the pmap in
1036 		 * anticipation of freeing it.  If, however, any of the page's
1037 		 * mappings allow write access, then the page may still be
1038 		 * modified until the last of those mappings are removed.
1039 		 */
1040 		if (object->ref_count != 0) {
1041 			vm_page_test_dirty(m);
1042 			if (m->dirty == 0)
1043 				pmap_remove_all(m);
1044 		}
1045 
1046 		/*
1047 		 * Clean pages are freed, and dirty pages are paged out unless
1048 		 * they belong to a dead object.  Requeueing dirty pages from
1049 		 * dead objects is pointless, as they are being paged out and
1050 		 * freed by the thread that destroyed the object.
1051 		 */
1052 		if (m->dirty == 0) {
1053 free_page:
1054 			vm_page_free(m);
1055 			PCPU_INC(cnt.v_dfree);
1056 		} else if ((object->flags & OBJ_DEAD) == 0) {
1057 			if (object->type != OBJT_SWAP &&
1058 			    object->type != OBJT_DEFAULT)
1059 				pageout_ok = true;
1060 			else if (disable_swap_pageouts)
1061 				pageout_ok = false;
1062 			else
1063 				pageout_ok = true;
1064 			if (!pageout_ok) {
1065 requeue_page:
1066 				vm_pagequeue_lock(pq);
1067 				queue_locked = true;
1068 				vm_page_requeue_locked(m);
1069 				goto drop_page;
1070 			}
1071 
1072 			/*
1073 			 * Form a cluster with adjacent, dirty pages from the
1074 			 * same object, and page out that entire cluster.
1075 			 *
1076 			 * The adjacent, dirty pages must also be in the
1077 			 * laundry.  However, their mappings are not checked
1078 			 * for new references.  Consequently, a recently
1079 			 * referenced page may be paged out.  However, that
1080 			 * page will not be prematurely reclaimed.  After page
1081 			 * out, the page will be placed in the inactive queue,
1082 			 * where any new references will be detected and the
1083 			 * page reactivated.
1084 			 */
1085 			error = vm_pageout_clean(m, &numpagedout);
1086 			if (error == 0) {
1087 				launder -= numpagedout;
1088 				maxscan -= numpagedout - 1;
1089 			} else if (error == EDEADLK) {
1090 				pageout_lock_miss++;
1091 				vnodes_skipped++;
1092 			}
1093 			goto relock_queue;
1094 		}
1095 drop_page:
1096 		vm_page_unlock(m);
1097 		VM_OBJECT_WUNLOCK(object);
1098 relock_queue:
1099 		if (!queue_locked) {
1100 			vm_pagequeue_lock(pq);
1101 			queue_locked = true;
1102 		}
1103 		next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
1104 		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
1105 	}
1106 	vm_pagequeue_unlock(pq);
1107 
1108 	if (launder > 0 && pq == &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]) {
1109 		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
1110 		goto scan;
1111 	}
1112 
1113 	/*
1114 	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
1115 	 * and we didn't launder enough pages.
1116 	 */
1117 	if (vnodes_skipped > 0 && launder > 0)
1118 		(void)speedup_syncer();
1119 
1120 	return (starting_target - launder);
1121 }
1122 
1123 /*
1124  * Compute the integer square root.
1125  */
1126 static u_int
1127 isqrt(u_int num)
1128 {
1129 	u_int bit, root, tmp;
1130 
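	/*
	 * Use the classic bit-by-bit method: "bit" starts at the largest
	 * power of four that fits in a u_int and is shifted down by two bits
	 * per iteration, so each pass determines one bit of the result,
	 * which is floor(sqrt(num)).  For example, isqrt(27) is 5.
	 */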
1131 	bit = 1u << ((NBBY * sizeof(u_int)) - 2);
1132 	while (bit > num)
1133 		bit >>= 2;
1134 	root = 0;
1135 	while (bit != 0) {
1136 		tmp = root + bit;
1137 		root >>= 1;
1138 		if (num >= tmp) {
1139 			num -= tmp;
1140 			root += bit;
1141 		}
1142 		bit >>= 2;
1143 	}
1144 	return (root);
1145 }
1146 
1147 /*
1148  * Perform the work of the laundry thread: periodically wake up and determine
1149  * whether any pages need to be laundered.  If so, determine the number of pages
1150  * that need to be laundered, and launder them.
1151  */
1152 static void
1153 vm_pageout_laundry_worker(void *arg)
1154 {
1155 	struct vm_domain *domain;
1156 	struct vm_pagequeue *pq;
1157 	uint64_t nclean, ndirty;
1158 	u_int last_launder, wakeups;
1159 	int domidx, last_target, launder, shortfall, shortfall_cycle, target;
1160 	bool in_shortfall;
1161 
1162 	domidx = (uintptr_t)arg;
1163 	domain = &vm_dom[domidx];
1164 	pq = &domain->vmd_pagequeues[PQ_LAUNDRY];
1165 	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
1166 	vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
1167 
1168 	shortfall = 0;
1169 	in_shortfall = false;
1170 	shortfall_cycle = 0;
1171 	target = 0;
1172 	last_launder = 0;
1173 
1174 	/*
1175 	 * Calls to these handlers are serialized by the swap syscall lock.
1176 	 */
1177 	(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, domain,
1178 	    EVENTHANDLER_PRI_ANY);
1179 	(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, domain,
1180 	    EVENTHANDLER_PRI_ANY);
1181 
1182 	/*
1183 	 * The pageout laundry worker is never done, so loop forever.
1184 	 */
1185 	for (;;) {
1186 		KASSERT(target >= 0, ("negative target %d", target));
1187 		KASSERT(shortfall_cycle >= 0,
1188 		    ("negative cycle %d", shortfall_cycle));
1189 		launder = 0;
1190 		wakeups = VM_METER_PCPU_CNT(v_pdwakeups);
1191 
1192 		/*
1193 		 * First determine whether we need to launder pages to meet a
1194 		 * shortage of free pages.
1195 		 */
1196 		if (shortfall > 0) {
1197 			in_shortfall = true;
1198 			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
1199 			target = shortfall;
1200 		} else if (!in_shortfall)
1201 			goto trybackground;
1202 		else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) {
1203 			/*
1204 			 * We recently entered shortfall and began laundering
1205 			 * pages.  If we have completed that laundering run
1206 			 * (and we are no longer in shortfall) or we have met
1207 			 * our laundry target through other activity, then we
1208 			 * can stop laundering pages.
1209 			 */
1210 			in_shortfall = false;
1211 			target = 0;
1212 			goto trybackground;
1213 		}
1214 		last_launder = wakeups;
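		/*
		 * Spread the remaining shortfall target evenly over the
		 * remaining ticks of the current shortfall cycle.
		 */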
1215 		launder = target / shortfall_cycle--;
1216 		goto dolaundry;
1217 
1218 		/*
1219 		 * There's no immediate need to launder any pages; see if we
1220 		 * meet the conditions to perform background laundering:
1221 		 *
1222 		 * 1. The ratio of dirty to clean inactive pages exceeds the
1223 		 *    background laundering threshold and the pagedaemon has
1224 		 *    been woken up to reclaim pages since our last
1225 		 *    laundering, or
1226 		 * 2. we haven't yet reached the target of the current
1227 		 *    background laundering run.
1228 		 *
1229 		 * The background laundering threshold is not a constant.
1230 		 * Instead, it is a slowly growing function of the number of
1231 		 * page daemon wakeups since the last laundering.  Thus, as the
1232 		 * ratio of dirty to clean inactive pages grows, the amount of
1233 		 * memory pressure required to trigger laundering decreases.
1234 		 */
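		/*
		 * For example, four pagedaemon wakeups since the last
		 * laundering give isqrt(4) == 2, so background laundering
		 * begins once the laundry queue holds at least half as many
		 * pages as the clean (inactive plus free) pool.
		 */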
1235 trybackground:
1236 		nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count;
1237 		ndirty = vm_cnt.v_laundry_count;
1238 		if (target == 0 && wakeups != last_launder &&
1239 		    ndirty * isqrt(wakeups - last_launder) >= nclean) {
1240 			target = vm_background_launder_target;
1241 		}
1242 
1243 		/*
1244 		 * We have a non-zero background laundering target.  If we've
1245 		 * laundered up to our maximum without observing a page daemon
1246 		 * wakeup, just stop.  This is a safety belt that ensures we
1247 		 * don't launder an excessive amount if memory pressure is low
1248 		 * and the ratio of dirty to clean pages is large.  Otherwise,
1249 		 * proceed at the background laundering rate.
1250 		 */
1251 		if (target > 0) {
1252 			if (wakeups != last_launder) {
1253 				last_launder = wakeups;
1254 				last_target = target;
1255 			} else if (last_target - target >=
1256 			    vm_background_launder_max * PAGE_SIZE / 1024) {
1257 				target = 0;
1258 			}
1259 			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
1260 			launder /= VM_LAUNDER_RATE;
1261 			if (launder > target)
1262 				launder = target;
1263 		}
1264 
1265 dolaundry:
1266 		if (launder > 0) {
1267 			/*
1268 			 * Because of I/O clustering, the number of laundered
1269 			 * pages could exceed "target" by the maximum size of
1270 			 * a cluster minus one.
1271 			 */
1272 			target -= min(vm_pageout_launder(domain, launder,
1273 			    in_shortfall), target);
1274 			pause("laundp", hz / VM_LAUNDER_RATE);
1275 		}
1276 
1277 		/*
1278 		 * If we're not currently laundering pages and the page daemon
1279 		 * hasn't posted a new request, sleep until the page daemon
1280 		 * kicks us.
1281 		 */
1282 		vm_pagequeue_lock(pq);
1283 		if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE)
1284 			(void)mtx_sleep(&vm_laundry_request,
1285 			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);
1286 
1287 		/*
1288 		 * If the pagedaemon has indicated that it's in shortfall, start
1289 		 * a shortfall laundering unless we're already in the middle of
1290 		 * one.  This may preempt a background laundering.
1291 		 */
1292 		if (vm_laundry_request == VM_LAUNDRY_SHORTFALL &&
1293 		    (!in_shortfall || shortfall_cycle == 0)) {
1294 			shortfall = vm_laundry_target() + vm_pageout_deficit;
1295 			target = 0;
1296 		} else
1297 			shortfall = 0;
1298 
1299 		if (target == 0)
1300 			vm_laundry_request = VM_LAUNDRY_IDLE;
1301 		vm_pagequeue_unlock(pq);
1302 	}
1303 }
1304 
1305 /*
1306  *	vm_pageout_scan does the dirty work for the pageout daemon.
1307  *
1308  *	pass == 0: Update active LRU/deactivate pages
1309  *	pass >= 1: Free inactive pages
1310  *
1311  * Returns true if pass was zero or enough pages were freed by the inactive
1312  * queue scan to meet the target.
1313  */
1314 static bool
1315 vm_pageout_scan(struct vm_domain *vmd, int pass)
1316 {
1317 	vm_page_t m, next;
1318 	struct vm_pagequeue *pq;
1319 	vm_object_t object;
1320 	long min_scan;
1321 	int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
1322 	int page_shortage, scan_tick, scanned, starting_page_shortage;
1323 	boolean_t queue_locked;
1324 
1325 	/*
1326 	 * If we need to reclaim memory, ask kernel caches to return
1327 	 * some.  We rate limit to avoid thrashing.
1328 	 */
1329 	if (vmd == &vm_dom[0] && pass > 0 &&
1330 	    (time_uptime - lowmem_uptime) >= lowmem_period) {
1331 		/*
1332 		 * Decrease registered cache sizes.
1333 		 */
1334 		SDT_PROBE0(vm, , , vm__lowmem_scan);
1335 		EVENTHANDLER_INVOKE(vm_lowmem, 0);
1336 		/*
1337 		 * We do this explicitly after the caches have been
1338 		 * drained above.
1339 		 */
1340 		uma_reclaim();
1341 		lowmem_uptime = time_uptime;
1342 	}
1343 
1344 	/*
1345 	 * The addl_page_shortage is the number of temporarily
1346 	 * stuck pages in the inactive queue.  In other words, the
1347 	 * number of pages from the inactive count that should be
1348 	 * discounted in setting the target for the active queue scan.
1349 	 */
1350 	addl_page_shortage = 0;
1351 
1352 	/*
1353 	 * Calculate the number of pages that we want to free.  This number
1354 	 * can be negative if many pages are freed between the wakeup call to
1355 	 * the page daemon and this calculation.
1356 	 */
1357 	if (pass > 0) {
1358 		deficit = atomic_readandclear_int(&vm_pageout_deficit);
1359 		page_shortage = vm_paging_target() + deficit;
1360 	} else
1361 		page_shortage = deficit = 0;
1362 	starting_page_shortage = page_shortage;
1363 
1364 	/*
1365 	 * Start scanning the inactive queue for pages that we can free.  The
1366 	 * scan will stop when we reach the target or we have scanned the
1367 	 * entire queue.  (Note that m->act_count is not used to make
1368 	 * decisions for the inactive queue, only for the active queue.)
1369 	 */
1370 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
1371 	maxscan = pq->pq_cnt;
1372 	vm_pagequeue_lock(pq);
1373 	queue_locked = TRUE;
1374 	for (m = TAILQ_FIRST(&pq->pq_pl);
1375 	     m != NULL && maxscan-- > 0 && page_shortage > 0;
1376 	     m = next) {
1377 		vm_pagequeue_assert_locked(pq);
1378 		KASSERT(queue_locked, ("unlocked inactive queue"));
1379 		KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
1380 
1381 		PCPU_INC(cnt.v_pdpages);
1382 		next = TAILQ_NEXT(m, plinks.q);
1383 
1384 		/*
1385 		 * skip marker pages
1386 		 */
1387 		if (m->flags & PG_MARKER)
1388 			continue;
1389 
1390 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
1391 		    ("Fictitious page %p cannot be in inactive queue", m));
1392 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1393 		    ("Unmanaged page %p cannot be in inactive queue", m));
1394 
1395 		/*
1396 		 * The page or object lock acquisitions fail if the
1397 		 * page was removed from the queue or moved to a
1398 		 * different position within the queue.  In either
1399 		 * case, addl_page_shortage should not be incremented.
1400 		 */
1401 		if (!vm_pageout_page_lock(m, &next))
1402 			goto unlock_page;
1403 		else if (m->hold_count != 0) {
1404 			/*
1405 			 * Held pages are essentially stuck in the
1406 			 * queue.  So, they ought to be discounted
1407 			 * from the inactive count.  See the
1408 			 * calculation of inactq_shortage before the
1409 			 * loop over the active queue below.
1410 			 */
1411 			addl_page_shortage++;
1412 			goto unlock_page;
1413 		}
1414 		object = m->object;
1415 		if (!VM_OBJECT_TRYWLOCK(object)) {
1416 			if (!vm_pageout_fallback_object_lock(m, &next))
1417 				goto unlock_object;
1418 			else if (m->hold_count != 0) {
1419 				addl_page_shortage++;
1420 				goto unlock_object;
1421 			}
1422 		}
1423 		if (vm_page_busied(m)) {
1424 			/*
1425 			 * Don't mess with busy pages.  Leave them at
1426 			 * the front of the queue.  Most likely, they
1427 			 * are being paged out and will leave the
1428 			 * queue shortly after the scan finishes.  So,
1429 			 * they ought to be discounted from the
1430 			 * inactive count.
1431 			 */
1432 			addl_page_shortage++;
1433 unlock_object:
1434 			VM_OBJECT_WUNLOCK(object);
1435 unlock_page:
1436 			vm_page_unlock(m);
1437 			continue;
1438 		}
1439 		KASSERT(m->hold_count == 0, ("Held page %p", m));
1440 
1441 		/*
1442 		 * Dequeue the inactive page and unlock the inactive page
1443 		 * queue, invalidating the 'next' pointer.  Dequeueing the
1444 		 * page here avoids a later reacquisition (and release) of
1445 		 * the inactive page queue lock when vm_page_activate(),
1446 		 * vm_page_free(), or vm_page_launder() is called.  Use a
1447 		 * marker to remember our place in the inactive queue.
1448 		 */
1449 		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
1450 		vm_page_dequeue_locked(m);
1451 		vm_pagequeue_unlock(pq);
1452 		queue_locked = FALSE;
1453 
1454 		/*
1455 		 * Invalid pages can be easily freed. They cannot be
1456 		 * mapped, vm_page_free() asserts this.
1457 		 */
1458 		if (m->valid == 0)
1459 			goto free_page;
1460 
1461 		/*
1462 		 * If the page has been referenced and the object is not dead,
1463 		 * reactivate or requeue the page depending on whether the
1464 		 * object is mapped.
1465 		 */
1466 		if ((m->aflags & PGA_REFERENCED) != 0) {
1467 			vm_page_aflag_clear(m, PGA_REFERENCED);
1468 			act_delta = 1;
1469 		} else
1470 			act_delta = 0;
1471 		if (object->ref_count != 0) {
1472 			act_delta += pmap_ts_referenced(m);
1473 		} else {
1474 			KASSERT(!pmap_page_is_mapped(m),
1475 			    ("vm_pageout_scan: page %p is mapped", m));
1476 		}
1477 		if (act_delta != 0) {
1478 			if (object->ref_count != 0) {
1479 				PCPU_INC(cnt.v_reactivated);
1480 				vm_page_activate(m);
1481 
1482 				/*
1483 				 * Increase the activation count if the page
1484 				 * was referenced while in the inactive queue.
1485 				 * This makes it less likely that the page will
1486 				 * be returned prematurely to the inactive
1487 				 * queue.
1488  				 */
1489 				m->act_count += act_delta + ACT_ADVANCE;
1490 				goto drop_page;
1491 			} else if ((object->flags & OBJ_DEAD) == 0) {
1492 				vm_pagequeue_lock(pq);
1493 				queue_locked = TRUE;
1494 				m->queue = PQ_INACTIVE;
1495 				TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
1496 				vm_pagequeue_cnt_inc(pq);
1497 				goto drop_page;
1498 			}
1499 		}
1500 
1501 		/*
1502 		 * If the page appears to be clean at the machine-independent
1503 		 * layer, then remove all of its mappings from the pmap in
1504 		 * anticipation of freeing it.  If, however, any of the page's
1505 		 * mappings allow write access, then the page may still be
1506 		 * modified until the last of those mappings are removed.
1507 		 */
1508 		if (object->ref_count != 0) {
1509 			vm_page_test_dirty(m);
1510 			if (m->dirty == 0)
1511 				pmap_remove_all(m);
1512 		}
1513 
1514 		/*
1515 		 * Clean pages can be freed, but dirty pages must be sent back
1516 		 * to the laundry, unless they belong to a dead object.
1517 		 * Requeueing dirty pages from dead objects is pointless, as
1518 		 * they are being paged out and freed by the thread that
1519 		 * destroyed the object.
1520 		 */
1521 		if (m->dirty == 0) {
1522 free_page:
1523 			vm_page_free(m);
1524 			PCPU_INC(cnt.v_dfree);
1525 			--page_shortage;
1526 		} else if ((object->flags & OBJ_DEAD) == 0)
1527 			vm_page_launder(m);
1528 drop_page:
1529 		vm_page_unlock(m);
1530 		VM_OBJECT_WUNLOCK(object);
1531 		if (!queue_locked) {
1532 			vm_pagequeue_lock(pq);
1533 			queue_locked = TRUE;
1534 		}
1535 		next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
1536 		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
1537 	}
1538 	vm_pagequeue_unlock(pq);
1539 
1540 	/*
1541 	 * Wake up the laundry thread so that it can perform any needed
1542 	 * laundering.  If we didn't meet our target, we're in shortfall and
1543 	 * need to launder more aggressively.  If PQ_LAUNDRY is empty and no
1544 	 * swap devices are configured, the laundry thread has no work to do, so
1545 	 * don't bother waking it up.
1546 	 */
1547 	if (vm_laundry_request == VM_LAUNDRY_IDLE &&
1548 	    starting_page_shortage > 0) {
1549 		pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY];
1550 		vm_pagequeue_lock(pq);
1551 		if (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled)) {
1552 			if (page_shortage > 0) {
1553 				vm_laundry_request = VM_LAUNDRY_SHORTFALL;
1554 				PCPU_INC(cnt.v_pdshortfalls);
1555 			} else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL)
1556 				vm_laundry_request = VM_LAUNDRY_BACKGROUND;
1557 			wakeup(&vm_laundry_request);
1558 		}
1559 		vm_pagequeue_unlock(pq);
1560 	}
1561 
1562 #if !defined(NO_SWAPPING)
1563 	/*
1564 	 * Wakeup the swapout daemon if we didn't free the targeted number of
1565 	 * pages.
1566 	 */
1567 	if (vm_swap_enabled && page_shortage > 0)
1568 		vm_req_vmdaemon(VM_SWAP_NORMAL);
1569 #endif
1570 
1571 	/*
1572 	 * If the inactive queue scan fails repeatedly to meet its
1573 	 * target, kill the largest process.
1574 	 */
1575 	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
1576 
1577 	/*
1578 	 * Compute the number of pages we want to try to move from the
1579 	 * active queue to either the inactive or laundry queue.
1580 	 *
1581 	 * When scanning active pages, we make clean pages count more heavily
1582 	 * towards the page shortage than dirty pages.  This is because dirty
1583 	 * pages must be laundered before they can be reused and thus have less
1584 	 * utility when attempting to quickly alleviate a shortage.  However,
1585 	 * this weighting also causes the scan to deactivate dirty pages more
1586 	 * aggressively, improving the effectiveness of clustering and
1587 	 * ensuring that they can eventually be reused.
1588 	 */
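	/*
	 * With the default weight of 3, for instance, a page in the laundry
	 * counts as only one third of a clean inactive page when computing
	 * the active queue scan target below.
	 */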
1589 	inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count +
1590 	    vm_cnt.v_laundry_count / act_scan_laundry_weight) +
1591 	    vm_paging_target() + deficit + addl_page_shortage;
1592 	page_shortage *= act_scan_laundry_weight;
1593 
1594 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
1595 	vm_pagequeue_lock(pq);
1596 	maxscan = pq->pq_cnt;
1597 
1598 	/*
1599 	 * If we're just idle polling, attempt to visit every
1600 	 * active page within 'update_period' seconds.
1601 	 */
1602 	scan_tick = ticks;
1603 	if (vm_pageout_update_period != 0) {
1604 		min_scan = pq->pq_cnt;
1605 		min_scan *= scan_tick - vmd->vmd_last_active_scan;
1606 		min_scan /= hz * vm_pageout_update_period;
1607 	} else
1608 		min_scan = 0;
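	/*
	 * min_scan is the queue length scaled by the fraction of the update
	 * period that has elapsed since the last active scan, so a complete
	 * pass over the active queue finishes once per update period.
	 */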
1609 	if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
1610 		vmd->vmd_last_active_scan = scan_tick;
1611 
1612 	/*
1613 	 * Scan the active queue for pages that can be deactivated.  Update
1614 	 * the per-page activity counter and use it to identify deactivation
1615 	 * candidates.  Held pages may be deactivated.
1616 	 */
1617 	for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
1618 	    min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
1619 	    scanned++) {
1620 		KASSERT(m->queue == PQ_ACTIVE,
1621 		    ("vm_pageout_scan: page %p isn't active", m));
1622 		next = TAILQ_NEXT(m, plinks.q);
1623 		if ((m->flags & PG_MARKER) != 0)
1624 			continue;
1625 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
1626 		    ("Fictitious page %p cannot be in active queue", m));
1627 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1628 		    ("Unmanaged page %p cannot be in active queue", m));
1629 		if (!vm_pageout_page_lock(m, &next)) {
1630 			vm_page_unlock(m);
1631 			continue;
1632 		}
1633 
1634 		/*
1635 		 * The count for page daemon pages is updated after checking
1636 		 * the page for eligibility.
1637 		 */
1638 		PCPU_INC(cnt.v_pdpages);
1639 
1640 		/*
1641 		 * Check to see "how much" the page has been used.
1642 		 */
1643 		if ((m->aflags & PGA_REFERENCED) != 0) {
1644 			vm_page_aflag_clear(m, PGA_REFERENCED);
1645 			act_delta = 1;
1646 		} else
1647 			act_delta = 0;
1648 
1649 		/*
1650 		 * Perform an unsynchronized object ref count check.  While
1651 		 * the page lock ensures that the page is not reallocated to
1652 		 * another object, in particular, one with unmanaged mappings
1653 		 * that cannot support pmap_ts_referenced(), two races are,
1654 		 * nonetheless, possible:
1655 		 * 1) The count was transitioning to zero, but we saw a non-
1656 		 *    zero value.  pmap_ts_referenced() will return zero
1657 		 *    because the page is not mapped.
1658 		 * 2) The count was transitioning to one, but we saw zero.
1659 		 *    This race delays the detection of a new reference.  At
1660 		 *    worst, we will deactivate and reactivate the page.
1661 		 */
1662 		if (m->object->ref_count != 0)
1663 			act_delta += pmap_ts_referenced(m);
1664 
1665 		/*
1666 		 * Advance or decay the act_count based on recent usage.
1667 		 */
1668 		if (act_delta != 0) {
1669 			m->act_count += ACT_ADVANCE + act_delta;
1670 			if (m->act_count > ACT_MAX)
1671 				m->act_count = ACT_MAX;
1672 		} else
1673 			m->act_count -= min(m->act_count, ACT_DECLINE);
1674 
1675 		/*
1676 		 * Move this page to the tail of the active, inactive or laundry
1677 		 * queue depending on usage.
1678 		 */
1679 		if (m->act_count == 0) {
1680 			/* Dequeue to avoid later lock recursion. */
1681 			vm_page_dequeue_locked(m);
1682 
1683 			/*
1684 			 * When not short for inactive pages, let dirty pages go
1685 			 * through the inactive queue before moving to the
1686 			 * laundry queues.  This gives them some extra time to
1687 			 * be reactivated, potentially avoiding an expensive
1688 			 * pageout.  During a page shortage, the inactive queue
1689 			 * is necessarily small, so we may move dirty pages
1690 			 * directly to the laundry queue.
1691 			 */
1692 			if (inactq_shortage <= 0)
1693 				vm_page_deactivate(m);
1694 			else {
1695 				/*
1696 				 * Calling vm_page_test_dirty() here would
1697 				 * require acquisition of the object's write
1698 				 * lock.  However, during a page shortage,
1699 				 * directing dirty pages into the laundry
1700 				 * queue is only an optimization and not a
1701 				 * requirement.  Therefore, we simply rely on
1702 				 * the opportunistic updates to the page's
1703 				 * dirty field by the pmap.
1704 				 */
1705 				if (m->dirty == 0) {
1706 					vm_page_deactivate(m);
1707 					inactq_shortage -=
1708 					    act_scan_laundry_weight;
1709 				} else {
1710 					vm_page_launder(m);
1711 					inactq_shortage--;
1712 				}
1713 			}
1714 		} else
1715 			vm_page_requeue_locked(m);
1716 		vm_page_unlock(m);
1717 	}
1718 	vm_pagequeue_unlock(pq);
1719 #if !defined(NO_SWAPPING)
1720 	/*
1721 	 * Idle process swapout -- run once per second when we are reclaiming
1722 	 * pages.
1723 	 */
1724 	if (vm_swap_idle_enabled && pass > 0) {
1725 		static long lsec;
1726 		if (time_second != lsec) {
1727 			vm_req_vmdaemon(VM_SWAP_IDLE);
1728 			lsec = time_second;
1729 		}
1730 	}
1731 #endif
1732 	return (page_shortage <= 0);
1733 }
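
#if 0	/* Illustrative sketch only; not part of the build. */
/*
 * A self-contained sketch of the act_count aging performed by the active
 * queue scan above: advance by ACT_ADVANCE plus the number of observed
 * references, clamp at ACT_MAX, and decay by ACT_DECLINE on every scan in
 * which no reference is seen.  The EX_* constants below stand in for those
 * macros with assumed example values (3, 1 and 64), and the starting value
 * is likewise assumed.  The simulation shows that a page which has been
 * busy for a while must sit idle through many consecutive scans before
 * act_count reaches zero and it becomes a deactivation candidate.
 */
#include <stdio.h>

#define	EX_ACT_ADVANCE	3
#define	EX_ACT_DECLINE	1
#define	EX_ACT_MAX	64

static int
ex_age_once(int act_count, int act_delta)
{

	if (act_delta != 0) {
		act_count += EX_ACT_ADVANCE + act_delta;
		if (act_count > EX_ACT_MAX)
			act_count = EX_ACT_MAX;
	} else
		act_count -= (act_count < EX_ACT_DECLINE ? act_count :
		    EX_ACT_DECLINE);
	return (act_count);
}

int
main(void)
{
	int act_count, scan;

	act_count = 5;				/* assumed starting value */
	for (scan = 0; scan < 4; scan++)	/* referenced on each scan */
		act_count = ex_age_once(act_count, 1);
	printf("after 4 referenced scans: %d\n", act_count);	/* 21 */
	for (scan = 0; act_count > 0; scan++)	/* now completely idle */
		act_count = ex_age_once(act_count, 0);
	printf("idle scans before deactivation: %d\n", scan);	/* 21 */
	return (0);
}
#endif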
1734 
1735 static int vm_pageout_oom_vote;
1736 
1737 /*
1738  * The pagedaemon threads randomly select one to perform the
1739  * OOM.  Trying to kill processes before all pagedaemons have
1740  * failed to reach the free page target is premature.
1741  */
1742 static void
1743 vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
1744     int starting_page_shortage)
1745 {
1746 	int old_vote;
1747 
1748 	if (starting_page_shortage <= 0 || starting_page_shortage !=
1749 	    page_shortage)
1750 		vmd->vmd_oom_seq = 0;
1751 	else
1752 		vmd->vmd_oom_seq++;
1753 	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
1754 		if (vmd->vmd_oom) {
1755 			vmd->vmd_oom = FALSE;
1756 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
1757 		}
1758 		return;
1759 	}
1760 
1761 	/*
1762 	 * Do not follow the call sequence until OOM condition is
1763 	 * cleared.
1764 	 */
1765 	vmd->vmd_oom_seq = 0;
1766 
1767 	if (vmd->vmd_oom)
1768 		return;
1769 
1770 	vmd->vmd_oom = TRUE;
1771 	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
1772 	if (old_vote != vm_ndomains - 1)
1773 		return;
1774 
1775 	/*
1776 	 * The current pagedaemon thread is the last in the quorum to
1777 	 * start OOM.  Initiate the selection and signaling of the
1778 	 * victim.
1779 	 */
1780 	vm_pageout_oom(VM_OOM_MEM);
1781 
1782 	/*
1783 	 * After one round of OOM terror, recall our vote.  On the
1784 	 * next pass, the current pagedaemon will vote again if the
1785 	 * low memory condition is still present, since vmd_oom is
1786 	 * now false.
1787 	 */
1788 	vmd->vmd_oom = FALSE;
1789 	atomic_subtract_int(&vm_pageout_oom_vote, 1);
1790 }
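
#if 0	/* Illustrative sketch only; not part of the build. */
/*
 * A minimal, self-contained sketch of the voting pattern used by
 * vm_pageout_mightbe_oom() above, written with C11 atomics rather than
 * the kernel's atomic(9) primitives.  The names and the domain count are
 * hypothetical.  The essential property is that only the thread whose
 * vote completes the quorum performs the action, and that every voter
 * eventually retracts its vote so the next round starts from zero.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define	EX_NDOMAINS	4		/* assumed number of domains */

static atomic_int ex_oom_vote;

/* Cast this domain's vote; return true if the caller completed the quorum. */
static bool
ex_cast_vote(bool *voted)
{
	int old_vote;

	if (*voted)
		return (false);
	*voted = true;
	old_vote = atomic_fetch_add(&ex_oom_vote, 1);
	return (old_vote == EX_NDOMAINS - 1);
}

/* Retract a previously cast vote, e.g. after the shortage clears. */
static void
ex_retract_vote(bool *voted)
{

	if (*voted) {
		*voted = false;
		atomic_fetch_sub(&ex_oom_vote, 1);
	}
}
#endif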
1791 
1792 /*
1793  * The OOM killer is the page daemon's action of last resort when
1794  * memory allocation requests have been stalled for a prolonged period
1795  * of time because it cannot reclaim memory.  This function computes
1796  * the approximate number of physical pages that could be reclaimed if
1797  * the specified address space is destroyed.
1798  *
1799  * Private, anonymous memory owned by the address space is the
1800  * principal resource that we expect to recover after an OOM kill.
1801  * Since the physical pages mapped by the address space's COW entries
1802  * are typically shared pages, they are unlikely to be released and so
1803  * they are not counted.
1804  *
1805  * To get to the point where the page daemon runs the OOM killer, its
1806  * efforts to write-back vnode-backed pages may have stalled.  This
1807  * could be caused by a memory allocation deadlock in the write path
1808  * that might be resolved by an OOM kill.  Therefore, physical pages
1809  * belonging to vnode-backed objects are counted, because they might
1810  * be freed without being written out first if the address space holds
1811  * the last reference to an unlinked vnode.
1812  *
1813  * Similarly, physical pages belonging to OBJT_PHYS objects are
1814  * counted because the address space might hold the last reference to
1815  * the object.
1816  */
1817 static long
1818 vm_pageout_oom_pagecount(struct vmspace *vmspace)
1819 {
1820 	vm_map_t map;
1821 	vm_map_entry_t entry;
1822 	vm_object_t obj;
1823 	long res;
1824 
1825 	map = &vmspace->vm_map;
1826 	KASSERT(!map->system_map, ("system map"));
1827 	sx_assert(&map->lock, SA_LOCKED);
1828 	res = 0;
1829 	for (entry = map->header.next; entry != &map->header;
1830 	    entry = entry->next) {
1831 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
1832 			continue;
1833 		obj = entry->object.vm_object;
1834 		if (obj == NULL)
1835 			continue;
1836 		if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
1837 		    obj->ref_count != 1)
1838 			continue;
1839 		switch (obj->type) {
1840 		case OBJT_DEFAULT:
1841 		case OBJT_SWAP:
1842 		case OBJT_PHYS:
1843 		case OBJT_VNODE:
1844 			res += obj->resident_page_count;
1845 			break;
1846 		}
1847 	}
1848 	return (res);
1849 }
1850 
1851 void
1852 vm_pageout_oom(int shortage)
1853 {
1854 	struct proc *p, *bigproc;
1855 	vm_offset_t size, bigsize;
1856 	struct thread *td;
1857 	struct vmspace *vm;
1858 
1859 	/*
1860 	 * We keep the process bigproc locked once we find it to keep anyone
1861 	 * from messing with it; however, there is a possibility of deadlock
1862 	 * if process B is bigproc and one of its child processes attempts to
1863 	 * propagate a signal to B while we are waiting for the lock of the
1864 	 * process we are currently examining.  To avoid this, we don't block
1865 	 * on the process lock but just skip a process if it is already locked.
1866 	 */
1867 	bigproc = NULL;
1868 	bigsize = 0;
1869 	sx_slock(&allproc_lock);
1870 	FOREACH_PROC_IN_SYSTEM(p) {
1871 		int breakout;
1872 
1873 		PROC_LOCK(p);
1874 
1875 		/*
1876 		 * If this is a system, protected or killed process, skip it.
1877 		 */
1878 		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
1879 		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
1880 		    p->p_pid == 1 || P_KILLED(p) ||
1881 		    (p->p_pid < 48 && swap_pager_avail != 0)) {
1882 			PROC_UNLOCK(p);
1883 			continue;
1884 		}
1885 		/*
1886 		 * If the process is in a non-running type state,
1887 		 * don't touch it.  Check all the threads individually.
1888 		 */
1889 		breakout = 0;
1890 		FOREACH_THREAD_IN_PROC(p, td) {
1891 			thread_lock(td);
1892 			if (!TD_ON_RUNQ(td) &&
1893 			    !TD_IS_RUNNING(td) &&
1894 			    !TD_IS_SLEEPING(td) &&
1895 			    !TD_IS_SUSPENDED(td) &&
1896 			    !TD_IS_SWAPPED(td)) {
1897 				thread_unlock(td);
1898 				breakout = 1;
1899 				break;
1900 			}
1901 			thread_unlock(td);
1902 		}
1903 		if (breakout) {
1904 			PROC_UNLOCK(p);
1905 			continue;
1906 		}
1907 		/*
1908 		 * get the process size
1909 		 */
1910 		vm = vmspace_acquire_ref(p);
1911 		if (vm == NULL) {
1912 			PROC_UNLOCK(p);
1913 			continue;
1914 		}
1915 		_PHOLD_LITE(p);
1916 		PROC_UNLOCK(p);
1917 		sx_sunlock(&allproc_lock);
1918 		if (!vm_map_trylock_read(&vm->vm_map)) {
1919 			vmspace_free(vm);
1920 			sx_slock(&allproc_lock);
1921 			PRELE(p);
1922 			continue;
1923 		}
1924 		size = vmspace_swap_count(vm);
1925 		if (shortage == VM_OOM_MEM)
1926 			size += vm_pageout_oom_pagecount(vm);
1927 		vm_map_unlock_read(&vm->vm_map);
1928 		vmspace_free(vm);
1929 		sx_slock(&allproc_lock);
1930 
1931 		/*
1932 		 * If this process is bigger than the biggest one,
1933 		 * remember it.
1934 		 */
1935 		if (size > bigsize) {
1936 			if (bigproc != NULL)
1937 				PRELE(bigproc);
1938 			bigproc = p;
1939 			bigsize = size;
1940 		} else {
1941 			PRELE(p);
1942 		}
1943 	}
1944 	sx_sunlock(&allproc_lock);
1945 	if (bigproc != NULL) {
1946 		if (vm_panic_on_oom != 0)
1947 			panic("out of swap space");
1948 		PROC_LOCK(bigproc);
1949 		killproc(bigproc, "out of swap space");
1950 		sched_nice(bigproc, PRIO_MIN);
1951 		_PRELE(bigproc);
1952 		PROC_UNLOCK(bigproc);
1953 		wakeup(&vm_cnt.v_free_count);
1954 	}
1955 }
1956 
1957 static void
1958 vm_pageout_worker(void *arg)
1959 {
1960 	struct vm_domain *domain;
1961 	int domidx, pass;
1962 	bool target_met;
1963 
1964 	domidx = (uintptr_t)arg;
1965 	domain = &vm_dom[domidx];
1966 	pass = 0;
1967 	target_met = true;
1968 
1969 	/*
1970 	 * XXXKIB It could be useful to bind pageout daemon threads to
1971 	 * the cores belonging to the domain, from which vm_page_array
1972 	 * is allocated.
1973 	 */
1974 
1975 	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
1976 	domain->vmd_last_active_scan = ticks;
1977 	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
1978 	vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE);
1979 	TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl,
1980 	    &domain->vmd_inacthead, plinks.q);
1981 
1982 	/*
1983 	 * The pageout daemon worker is never done, so loop forever.
1984 	 */
1985 	while (TRUE) {
1986 		mtx_lock(&vm_page_queue_free_mtx);
1987 
1988 		/*
1989 		 * Generally, after a level >= 1 scan, if there are enough
1990 		 * free pages to wakeup the waiters, then they are already
1991 		 * awake.  A call to vm_page_free() during the scan awakened
1992 		 * them.  However, in the following case, this wakeup serves
1993 		 * to bound the amount of time that a thread might wait.
1994 		 * Suppose a thread's call to vm_page_alloc() fails, but
1995 		 * before that thread calls VM_WAIT, enough pages are freed by
1996 		 * other threads to alleviate the free page shortage.  The
1997 		 * thread will, nonetheless, wait until another page is freed
1998 		 * or this wakeup is performed.
1999 		 */
2000 		if (vm_pages_needed && !vm_page_count_min()) {
2001 			vm_pages_needed = false;
2002 			wakeup(&vm_cnt.v_free_count);
2003 		}
2004 
2005 		/*
2006 		 * Do not clear vm_pageout_wanted until we reach our free page
2007 		 * target.  Otherwise, we may be awakened over and over again,
2008 		 * wasting CPU time.
2009 		 */
2010 		if (vm_pageout_wanted && target_met)
2011 			vm_pageout_wanted = false;
2012 
2013 		/*
2014 		 * Might the page daemon receive a wakeup call?
2015 		 */
2016 		if (vm_pageout_wanted) {
2017 			/*
2018 			 * No.  Either vm_pageout_wanted was set by another
2019 			 * thread during the previous scan, which must have
2020 			 * been a level 0 scan, or vm_pageout_wanted was
2021 			 * already set and the scan failed to free enough
2022 			 * pages.  If we haven't yet performed a level >= 1
2023 			 * (page reclamation) scan, then increase the level
2024 			 * and scan again now.  Otherwise, sleep a bit and
2025 			 * try again later.
2026 			 */
2027 			mtx_unlock(&vm_page_queue_free_mtx);
2028 			if (pass >= 1)
2029 				pause("psleep", hz / VM_INACT_SCAN_RATE);
2030 			pass++;
2031 		} else {
2032 			/*
2033 			 * Yes.  Sleep until pages need to be reclaimed or
2034 			 * have their reference stats updated.
2035 			 */
2036 			if (mtx_sleep(&vm_pageout_wanted,
2037 			    &vm_page_queue_free_mtx, PDROP | PVM, "psleep",
2038 			    hz) == 0) {
2039 				PCPU_INC(cnt.v_pdwakeups);
2040 				pass = 1;
2041 			} else
2042 				pass = 0;
2043 		}
2044 
2045 		target_met = vm_pageout_scan(domain, pass);
2046 	}
2047 }
2048 
2049 /*
2050  *	vm_pageout_init initialises basic pageout daemon settings.
2051  */
2052 static void
2053 vm_pageout_init(void)
2054 {
2055 	/*
2056 	 * Initialize some paging parameters.
2057 	 */
2058 	vm_cnt.v_interrupt_free_min = 2;
2059 	if (vm_cnt.v_page_count < 2000)
2060 		vm_pageout_page_count = 8;
2061 
2062 	/*
2063 	 * v_free_reserved needs to include enough for the largest
2064 	 * swap pager structures plus enough for any pv_entry structs
2065 	 * when paging.
2066 	 */
2067 	if (vm_cnt.v_page_count > 1024)
2068 		vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
2069 	else
2070 		vm_cnt.v_free_min = 4;
2071 	vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
2072 	    vm_cnt.v_interrupt_free_min;
2073 	vm_cnt.v_free_reserved = vm_pageout_page_count +
2074 	    vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
2075 	vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
2076 	vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
2077 	vm_cnt.v_free_min += vm_cnt.v_free_reserved;
2078 	vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
2079 	vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
2080 	if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
2081 		vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
2082 
2083 	/*
2084 	 * Set the default wakeup threshold to be 10% above the minimum
2085 	 * page limit.  This keeps the steady state out of shortfall.
2086 	 */
2087 	vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
2088 
2089 	/*
2090 	 * Set interval in seconds for active scan.  We want to visit each
2091 	 * page at least once every ten minutes.  This is to prevent worst
2092 	 * case paging behaviors with stale active LRU.
2093 	 */
2094 	if (vm_pageout_update_period == 0)
2095 		vm_pageout_update_period = 600;
2096 
2097 	/* XXX does not really belong here */
2098 	if (vm_page_max_wired == 0)
2099 		vm_page_max_wired = vm_cnt.v_free_count / 3;
2100 
2101 	/*
2102 	 * Target amount of memory to move out of the laundry queue during a
2103 	 * background laundering.  This is proportional to the amount of system
2104 	 * memory.
2105 	 */
2106 	vm_background_launder_target = (vm_cnt.v_free_target -
2107 	    vm_cnt.v_free_min) / 10;
2108 }
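
#if 0	/* Illustrative sketch only; not part of the build. */
/*
 * A self-contained sketch that plugs assumed numbers into the formulas
 * above: a machine with 2GB of 4KB pages (page_count = 524288), an
 * assumed MAXBSIZE of 64KB and an assumed vm_pageout_page_count of 32.
 * The clamp of the inactive target against v_free_count / 3 is omitted
 * because it does not bind for these inputs.  All inputs are examples,
 * not a statement about any particular configuration.
 */
#include <stdio.h>

int
main(void)
{
	long page_count = 524288;			/* assumed */
	long maxbsize = 65536, page_size = 4096;	/* assumed */
	long pageout_page_count = 32;			/* assumed */
	long interrupt_free_min = 2;
	long free_min, pageout_free_min, free_reserved, free_severe;
	long free_target, inactive_target, wakeup_thresh, launder_target;

	free_min = 4 + (page_count - 1024) / 200;		/* 2620 */
	pageout_free_min = (2 * maxbsize) / page_size +
	    interrupt_free_min;					/* 34 */
	free_reserved = pageout_page_count + pageout_free_min +
	    page_count / 768;					/* 748 */
	free_severe = free_min / 2;				/* 1310 */
	free_target = 4 * free_min + free_reserved;		/* 11228 */
	free_min += free_reserved;				/* 3368 */
	free_severe += free_reserved;				/* 2058 */
	inactive_target = (3 * free_target) / 2;		/* 16842 */
	wakeup_thresh = (free_min / 10) * 11;			/* 3696 */
	launder_target = (free_target - free_min) / 10;		/* 786 */

	printf("free_min %ld free_target %ld inactive_target %ld\n",
	    free_min, free_target, inactive_target);
	printf("wakeup_thresh %ld launder_target %ld severe %ld\n",
	    wakeup_thresh, launder_target, free_severe);
	return (0);
}
#endif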
2109 
2110 /*
2111  *	vm_pageout is the high-level pageout daemon.
2112  */
2113 static void
2114 vm_pageout(void)
2115 {
2116 	int error;
2117 #ifdef VM_NUMA_ALLOC
2118 	int i;
2119 #endif
2120 
2121 	swap_pager_swap_init();
2122 	error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
2123 	    0, 0, "laundry: dom0");
2124 	if (error != 0)
2125 		panic("starting laundry for domain 0, error %d", error);
2126 #ifdef VM_NUMA_ALLOC
2127 	for (i = 1; i < vm_ndomains; i++) {
2128 		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
2129 		    curproc, NULL, 0, 0, "dom%d", i);
2130 		if (error != 0) {
2131 			panic("starting pageout for domain %d, error %d\n",
2132 			    i, error);
2133 		}
2134 	}
2135 #endif
2136 	error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
2137 	    0, 0, "uma");
2138 	if (error != 0)
2139 		panic("starting uma_reclaim helper, error %d\n", error);
2140 	vm_pageout_worker((void *)(uintptr_t)0);
2141 }
2142 
2143 /*
2144  * Unless the free page queue lock is held by the caller, this function
2145  * should be regarded as advisory.  Specifically, the caller should
2146  * not msleep() on &vm_cnt.v_free_count following this function unless
2147  * the free page queue lock is held until the msleep() is performed.
2148  */
2149 void
2150 pagedaemon_wakeup(void)
2151 {
2152 
2153 	if (!vm_pageout_wanted && curthread->td_proc != pageproc) {
2154 		vm_pageout_wanted = true;
2155 		wakeup(&vm_pageout_wanted);
2156 	}
2157 }
2158 
2159 #if !defined(NO_SWAPPING)
2160 static void
2161 vm_req_vmdaemon(int req)
2162 {
2163 	static int lastrun = 0;
2164 
2165 	mtx_lock(&vm_daemon_mtx);
2166 	vm_pageout_req_swapout |= req;
2167 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2168 		wakeup(&vm_daemon_needed);
2169 		lastrun = ticks;
2170 	}
2171 	mtx_unlock(&vm_daemon_mtx);
2172 }
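
#if 0	/* Illustrative sketch only; not part of the build. */
/*
 * A minimal sketch of the rate limit used above: wake the daemon at most
 * about once per second of ticks, and treat a wrapped (now smaller) tick
 * counter as "enough time has passed" so the limiter cannot wedge after
 * the signed counter rolls over.  The names and EX_HZ are hypothetical.
 */
#include <stdbool.h>

#define	EX_HZ	1000			/* assumed ticks per second */

static bool
ex_may_wakeup(int now_ticks, int *lastrun)
{

	if (now_ticks > *lastrun + EX_HZ || now_ticks < *lastrun) {
		*lastrun = now_ticks;
		return (true);		/* caller may wake the daemon */
	}
	return (false);
}
#endif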
2173 
2174 static void
2175 vm_daemon(void)
2176 {
2177 	struct rlimit rsslim;
2178 	struct proc *p;
2179 	struct thread *td;
2180 	struct vmspace *vm;
2181 	int breakout, swapout_flags, tryagain, attempts;
2182 #ifdef RACCT
2183 	uint64_t rsize, ravailable;
2184 #endif
2185 
2186 	while (TRUE) {
2187 		mtx_lock(&vm_daemon_mtx);
2188 		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
2189 #ifdef RACCT
2190 		    racct_enable ? hz : 0
2191 #else
2192 		    0
2193 #endif
2194 		);
2195 		swapout_flags = vm_pageout_req_swapout;
2196 		vm_pageout_req_swapout = 0;
2197 		mtx_unlock(&vm_daemon_mtx);
2198 		if (swapout_flags)
2199 			swapout_procs(swapout_flags);
2200 
2201 		/*
2202 		 * Scan for processes that exceed their RSS rlimit or that
2203 		 * are swapped out, and deactivate some of their pages.
2204 		 */
2205 		tryagain = 0;
2206 		attempts = 0;
2207 again:
2208 		attempts++;
2209 		sx_slock(&allproc_lock);
2210 		FOREACH_PROC_IN_SYSTEM(p) {
2211 			vm_pindex_t limit, size;
2212 
2213 			/*
2214 			 * if this is a system process or if we have already
2215 			 * looked at this process, skip it.
2216 			 */
2217 			PROC_LOCK(p);
2218 			if (p->p_state != PRS_NORMAL ||
2219 			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
2220 				PROC_UNLOCK(p);
2221 				continue;
2222 			}
2223 			/*
2224 			 * if the process is in a non-running type state,
2225 			 * don't touch it.
2226 			 */
2227 			breakout = 0;
2228 			FOREACH_THREAD_IN_PROC(p, td) {
2229 				thread_lock(td);
2230 				if (!TD_ON_RUNQ(td) &&
2231 				    !TD_IS_RUNNING(td) &&
2232 				    !TD_IS_SLEEPING(td) &&
2233 				    !TD_IS_SUSPENDED(td)) {
2234 					thread_unlock(td);
2235 					breakout = 1;
2236 					break;
2237 				}
2238 				thread_unlock(td);
2239 			}
2240 			if (breakout) {
2241 				PROC_UNLOCK(p);
2242 				continue;
2243 			}
2244 			/*
2245 			 * get a limit
2246 			 */
2247 			lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
2248 			limit = OFF_TO_IDX(
2249 			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
2250 
2251 			/*
2252 			 * Let processes that are swapped out really be
2253 			 * swapped out: set the limit to nothing, which will
2254 			 * force a swap-out.
2255 			 */
2256 			if ((p->p_flag & P_INMEM) == 0)
2257 				limit = 0;	/* XXX */
2258 			vm = vmspace_acquire_ref(p);
2259 			_PHOLD_LITE(p);
2260 			PROC_UNLOCK(p);
2261 			if (vm == NULL) {
2262 				PRELE(p);
2263 				continue;
2264 			}
2265 			sx_sunlock(&allproc_lock);
2266 
2267 			size = vmspace_resident_count(vm);
2268 			if (size >= limit) {
2269 				vm_pageout_map_deactivate_pages(
2270 				    &vm->vm_map, limit);
2271 			}
2272 #ifdef RACCT
2273 			if (racct_enable) {
2274 				rsize = IDX_TO_OFF(size);
2275 				PROC_LOCK(p);
2276 				racct_set(p, RACCT_RSS, rsize);
2277 				ravailable = racct_get_available(p, RACCT_RSS);
2278 				PROC_UNLOCK(p);
2279 				if (rsize > ravailable) {
2280 					/*
2281 					 * Don't be overly aggressive; this
2282 					 * might be an innocent process,
2283 					 * and the limit could've been exceeded
2284 					 * by some memory hog.  Don't try
2285 					 * to deactivate more than 1/4th
2286 					 * of process' resident set size.
2287 					 */
2288 					if (attempts <= 8) {
2289 						if (ravailable < rsize -
2290 						    (rsize / 4)) {
2291 							ravailable = rsize -
2292 							    (rsize / 4);
2293 						}
2294 					}
2295 					vm_pageout_map_deactivate_pages(
2296 					    &vm->vm_map,
2297 					    OFF_TO_IDX(ravailable));
2298 					/* Update RSS usage after paging out. */
2299 					size = vmspace_resident_count(vm);
2300 					rsize = IDX_TO_OFF(size);
2301 					PROC_LOCK(p);
2302 					racct_set(p, RACCT_RSS, rsize);
2303 					PROC_UNLOCK(p);
2304 					if (rsize > ravailable)
2305 						tryagain = 1;
2306 				}
2307 			}
2308 #endif
2309 			vmspace_free(vm);
2310 			sx_slock(&allproc_lock);
2311 			PRELE(p);
2312 		}
2313 		sx_sunlock(&allproc_lock);
2314 		if (tryagain != 0 && attempts <= 10)
2315 			goto again;
2316 	}
2317 }
2318 #endif			/* !defined(NO_SWAPPING) */
2319