xref: /freebsd/sys/vm/vm_pageout.c (revision 39ee7a7a6bdd1557b1c3532abf60d139798ac88b)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2005 Yahoo! Technologies Norway AS
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * The Mach Operating System project at Carnegie-Mellon University.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
43  *
44  *
45  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
46  * All rights reserved.
47  *
48  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
49  *
50  * Permission to use, copy, modify and distribute this software and
51  * its documentation is hereby granted, provided that both the copyright
52  * notice and this permission notice appear in all copies of the
53  * software, derivative works or modified versions, and any portions
54  * thereof, and that both notices appear in supporting documentation.
55  *
56  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
57  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
58  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
59  *
60  * Carnegie Mellon requests users of this software to return to
61  *
62  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
63  *  School of Computer Science
64  *  Carnegie Mellon University
65  *  Pittsburgh PA 15213-3890
66  *
67  * any improvements or extensions that they make and grant Carnegie the
68  * rights to redistribute these changes.
69  */
70 
71 /*
72  *	The proverbial page-out daemon.
73  */
74 
75 #include <sys/cdefs.h>
76 __FBSDID("$FreeBSD$");
77 
78 #include "opt_vm.h"
79 #include "opt_kdtrace.h"
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/kernel.h>
83 #include <sys/eventhandler.h>
84 #include <sys/lock.h>
85 #include <sys/mutex.h>
86 #include <sys/proc.h>
87 #include <sys/kthread.h>
88 #include <sys/ktr.h>
89 #include <sys/mount.h>
90 #include <sys/racct.h>
91 #include <sys/resourcevar.h>
92 #include <sys/sched.h>
93 #include <sys/sdt.h>
94 #include <sys/signalvar.h>
95 #include <sys/smp.h>
96 #include <sys/time.h>
97 #include <sys/vnode.h>
98 #include <sys/vmmeter.h>
99 #include <sys/rwlock.h>
100 #include <sys/sx.h>
101 #include <sys/sysctl.h>
102 
103 #include <vm/vm.h>
104 #include <vm/vm_param.h>
105 #include <vm/vm_object.h>
106 #include <vm/vm_page.h>
107 #include <vm/vm_map.h>
108 #include <vm/vm_pageout.h>
109 #include <vm/vm_pager.h>
110 #include <vm/vm_phys.h>
111 #include <vm/swap_pager.h>
112 #include <vm/vm_extern.h>
113 #include <vm/uma.h>
114 
115 /*
116  * System initialization
117  */
118 
119 /* The kernel process "vm_pageout". */
120 static void vm_pageout(void);
121 static void vm_pageout_init(void);
122 static int vm_pageout_clean(vm_page_t m);
123 static int vm_pageout_cluster(vm_page_t m);
124 static void vm_pageout_scan(struct vm_domain *vmd, int pass);
125 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
126 
127 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
128     NULL);
129 
130 struct proc *pageproc;
131 
132 static struct kproc_desc page_kp = {
133 	"pagedaemon",
134 	vm_pageout,
135 	&pageproc
136 };
137 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
138     &page_kp);
139 
140 SDT_PROVIDER_DEFINE(vm);
141 SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
142 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
143 
144 #if !defined(NO_SWAPPING)
145 /* The kernel process "vm_daemon". */
146 static void vm_daemon(void);
147 static struct	proc *vmproc;
148 
149 static struct kproc_desc vm_kp = {
150 	"vmdaemon",
151 	vm_daemon,
152 	&vmproc
153 };
154 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
155 #endif
156 
157 
158 int vm_pages_needed;		/* Event on which pageout daemon sleeps */
159 int vm_pageout_deficit;		/* Estimated number of pages deficit */
160 int vm_pageout_wakeup_thresh;
161 
162 #if !defined(NO_SWAPPING)
163 static int vm_pageout_req_swapout;	/* XXX */
164 static int vm_daemon_needed;
165 static struct mtx vm_daemon_mtx;
166 /* Allow for use by vm_pageout before vm_daemon is initialized. */
167 MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
168 #endif
169 static int vm_max_launder = 32;
170 static int vm_pageout_update_period;
171 static int defer_swap_pageouts;
172 static int disable_swap_pageouts;
173 static int lowmem_period = 10;
174 static time_t lowmem_uptime;
175 
176 #if defined(NO_SWAPPING)
177 static int vm_swap_enabled = 0;
178 static int vm_swap_idle_enabled = 0;
179 #else
180 static int vm_swap_enabled = 1;
181 static int vm_swap_idle_enabled = 0;
182 #endif
183 
184 static int vm_panic_on_oom = 0;
185 
186 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
187 	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
188 	"panic on out of memory instead of killing the largest process");
189 
190 SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
191 	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
192 	"free page threshold for waking up the pageout daemon");
193 
194 SYSCTL_INT(_vm, OID_AUTO, max_launder,
195 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
196 
197 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
198 	CTLFLAG_RW, &vm_pageout_update_period, 0,
199 	"Maximum active LRU update period");
200 
201 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
202 	"Low memory callback period");
203 
204 #if defined(NO_SWAPPING)
205 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
206 	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
207 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
208 	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
209 #else
210 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
211 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
212 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
213 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
214 #endif
215 
216 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
217 	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
218 
219 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
220 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
221 
222 static int pageout_lock_miss;
223 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
224 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
225 
226 #define VM_PAGEOUT_PAGE_COUNT 16
227 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
228 
229 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
230 SYSCTL_INT(_vm, OID_AUTO, max_wired,
231 	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
232 
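/*
 * Illustrative sketch, not kernel code: the tunables registered above are
 * visible from userland through the sysctl MIB under "vm.".  A minimal
 * consumer, assuming the knob names registered above and omitting error
 * handling, could read and adjust them with sysctlbyname(3):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	int thresh, newmax = 64;
 *	size_t len = sizeof(thresh);
 *
 *	sysctlbyname("vm.pageout_wakeup_thresh", &thresh, &len, NULL, 0);
 *	sysctlbyname("vm.max_launder", NULL, NULL, &newmax, sizeof(newmax));
 *
 * Knobs registered read-only (CTLFLAG_RD) accept only the read form.
 */
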
233 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
234 static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
235     vm_paddr_t);
236 #if !defined(NO_SWAPPING)
237 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
238 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
239 static void vm_req_vmdaemon(int req);
240 #endif
241 static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
242 
243 /*
244  * Initialize a dummy page for marking the caller's place in the specified
245  * paging queue.  In principle, this function only needs to set the flag
246  * PG_MARKER.  Nonetheless, it write busies and initializes the hold count
247  * to one as safety precautions.
248  */
249 static void
250 vm_pageout_init_marker(vm_page_t marker, u_short queue)
251 {
252 
253 	bzero(marker, sizeof(*marker));
254 	marker->flags = PG_MARKER;
255 	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
256 	marker->queue = queue;
257 	marker->hold_count = 1;
258 }
259 
260 /*
261  * vm_pageout_fallback_object_lock:
262  *
263  * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
264  * known to have failed and page queue must be either PQ_ACTIVE or
265  * PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
266  * while locking the vm object.  Use marker page to detect page queue
267  * changes and maintain notion of next page on page queue.  Return
268  * TRUE if no changes were detected, FALSE otherwise.  vm object is
269  * locked on return.
270  *
271  * This function depends on both the lock portion of struct vm_object
272  * and normal struct vm_page being type stable.
273  */
274 static boolean_t
275 vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
276 {
277 	struct vm_page marker;
278 	struct vm_pagequeue *pq;
279 	boolean_t unchanged;
280 	u_short queue;
281 	vm_object_t object;
282 
283 	queue = m->queue;
284 	vm_pageout_init_marker(&marker, queue);
285 	pq = vm_page_pagequeue(m);
286 	object = m->object;
287 
288 	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
289 	vm_pagequeue_unlock(pq);
290 	vm_page_unlock(m);
291 	VM_OBJECT_WLOCK(object);
292 	vm_page_lock(m);
293 	vm_pagequeue_lock(pq);
294 
295 	/*
296 	 * The page's object might have changed, and/or the page might
297 	 * have moved from its original position in the queue.  If the
298 	 * page's object has changed, then the caller should abandon
299 	 * processing the page because the wrong object lock was
300 	 * acquired.  Use the marker's plinks.q, not the page's, to
301 	 * determine if the page has been moved.  The state of the
302 	 * page's plinks.q can be indeterminate; whereas, the marker's
303 	 * plinks.q must be valid.
304 	 */
305 	*next = TAILQ_NEXT(&marker, plinks.q);
306 	unchanged = m->object == object &&
307 	    m == TAILQ_PREV(&marker, pglist, plinks.q);
308 	KASSERT(!unchanged || m->queue == queue,
309 	    ("page %p queue %d %d", m, queue, m->queue));
310 	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
311 	return (unchanged);
312 }
313 
314 /*
315  * Lock the page while holding the page queue lock.  Use marker page
316  * to detect page queue changes and maintain notion of next page on
317  * page queue.  Return TRUE if no changes were detected, FALSE
318  * otherwise.  The page is locked on return. The page queue lock might
319  * be dropped and reacquired.
320  *
321  * This function depends on normal struct vm_page being type stable.
322  */
323 static boolean_t
324 vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
325 {
326 	struct vm_page marker;
327 	struct vm_pagequeue *pq;
328 	boolean_t unchanged;
329 	u_short queue;
330 
331 	vm_page_lock_assert(m, MA_NOTOWNED);
332 	if (vm_page_trylock(m))
333 		return (TRUE);
334 
335 	queue = m->queue;
336 	vm_pageout_init_marker(&marker, queue);
337 	pq = vm_page_pagequeue(m);
338 
339 	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
340 	vm_pagequeue_unlock(pq);
341 	vm_page_lock(m);
342 	vm_pagequeue_lock(pq);
343 
344 	/* Page queue might have changed. */
345 	*next = TAILQ_NEXT(&marker, plinks.q);
346 	unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
347 	KASSERT(!unchanged || m->queue == queue,
348 	    ("page %p queue %d %d", m, queue, m->queue));
349 	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
350 	return (unchanged);
351 }
352 
353 /*
354  * vm_pageout_cluster:
355  *
356  * Clean the page and remove it from the laundry.
357  *
358  * We set the busy bit to cause potential page faults on this page to
359  * block.  Note the careful timing, however: the busy bit isn't set until
360  * late and we cannot do anything that will mess with the page.
361  */
362 static int
363 vm_pageout_cluster(vm_page_t m)
364 {
365 	vm_object_t object;
366 	vm_page_t mc[2*vm_pageout_page_count], pb, ps;
367 	int pageout_count;
368 	int ib, is, page_base;
369 	vm_pindex_t pindex = m->pindex;
370 
371 	vm_page_lock_assert(m, MA_OWNED);
372 	object = m->object;
373 	VM_OBJECT_ASSERT_WLOCKED(object);
374 
375 	/*
376 	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
377 	 * with the new swapper, but we could have serious problems paging
378 	 * out other object types if there is insufficient memory.
379 	 *
380 	 * Unfortunately, checking free memory here is far too late, so the
381 	 * check has been moved up a procedural level.
382 	 */
383 
384 	/*
385 	 * Can't clean the page if it's busy or held.
386 	 */
387 	vm_page_assert_unbusied(m);
388 	KASSERT(m->hold_count == 0, ("vm_pageout_cluster: page %p is held", m));
389 	vm_page_unlock(m);
390 
391 	mc[vm_pageout_page_count] = pb = ps = m;
392 	pageout_count = 1;
393 	page_base = vm_pageout_page_count;
394 	ib = 1;
395 	is = 1;
396 
397 	/*
398 	 * Scan object for clusterable pages.
399 	 *
400 	 * We can cluster ONLY if: ->> the page is NOT
401 	 * clean, wired, busy, held, or mapped into a
402 	 * buffer, and one of the following:
403 	 * 1) The page is inactive, or a seldom used
404 	 *    active page.
405 	 * -or-
406 	 * 2) we force the issue.
407 	 *
408 	 * During heavy mmap/modification loads the pageout
409 	 * daemon can really fragment the underlying file
410 	 * due to flushing pages out of order and not trying to
411 	 * align the clusters (which leaves sporadic out-of-order
412 	 * holes).  To solve this problem we do the reverse scan
413 	 * first and attempt to align our cluster, then do a
414 	 * forward scan if room remains.
415 	 */
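	/*
	 * Worked example, assuming the default vm_pageout_page_count of 16:
	 * for a page at pindex 37, the reverse scan below collects pindexes
	 * 36..32 and stops when (pindex - (ib - 1)) reaches the 16-page
	 * boundary at 32; the forward scan then extends the cluster through
	 * pindex 47, yielding one aligned 16-page run [32, 47] whenever
	 * enough dirty, inactive, unheld neighbors exist.
	 */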
416 more:
417 	while (ib && pageout_count < vm_pageout_page_count) {
418 		vm_page_t p;
419 
420 		if (ib > pindex) {
421 			ib = 0;
422 			break;
423 		}
424 
425 		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
426 			ib = 0;
427 			break;
428 		}
429 		vm_page_test_dirty(p);
430 		if (p->dirty == 0) {
431 			ib = 0;
432 			break;
433 		}
434 		vm_page_lock(p);
435 		if (p->queue != PQ_INACTIVE ||
436 		    p->hold_count != 0) {	/* may be undergoing I/O */
437 			vm_page_unlock(p);
438 			ib = 0;
439 			break;
440 		}
441 		vm_page_unlock(p);
442 		mc[--page_base] = pb = p;
443 		++pageout_count;
444 		++ib;
445 		/*
446 		 * alignment boundary, stop here and switch directions.  Do
447 		 * not clear ib.
448 		 */
449 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
450 			break;
451 	}
452 
453 	while (pageout_count < vm_pageout_page_count &&
454 	    pindex + is < object->size) {
455 		vm_page_t p;
456 
457 		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
458 			break;
459 		vm_page_test_dirty(p);
460 		if (p->dirty == 0)
461 			break;
462 		vm_page_lock(p);
463 		if (p->queue != PQ_INACTIVE ||
464 		    p->hold_count != 0) {	/* may be undergoing I/O */
465 			vm_page_unlock(p);
466 			break;
467 		}
468 		vm_page_unlock(p);
469 		mc[page_base + pageout_count] = ps = p;
470 		++pageout_count;
471 		++is;
472 	}
473 
474 	/*
475 	 * If we exhausted our forward scan, continue with the reverse scan
476 	 * when possible, even past a page boundary.  This catches boundary
477 	 * conditions.
478 	 */
479 	if (ib && pageout_count < vm_pageout_page_count)
480 		goto more;
481 
482 	/*
483 	 * we allow reads during pageouts...
484 	 */
485 	return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
486 	    NULL));
487 }
488 
489 /*
490  * vm_pageout_flush() - launder the given pages
491  *
492  *	The given pages are laundered.  Note that we set up for the start of
493  *	I/O (i.e., busy the page), mark it read-only, and bump the object
494  *	reference count all in here rather than in the parent.  If we want
495  *	the parent to do more sophisticated things we may have to change
496  *	the ordering.
497  *
498  *	The returned runlen is the count of pages between mreq and the first
499  *	page after mreq with status VM_PAGER_AGAIN.
500  *	*eio is set to TRUE if the pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
501  *	for any page in the runlen set.
502  */
503 int
504 vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
505     boolean_t *eio)
506 {
507 	vm_object_t object = mc[0]->object;
508 	int pageout_status[count];
509 	int numpagedout = 0;
510 	int i, runlen;
511 
512 	VM_OBJECT_ASSERT_WLOCKED(object);
513 
514 	/*
515 	 * Initiate I/O.  Bump the vm_page_t->busy counter and
516 	 * mark the pages read-only.
517 	 *
518 	 * We do not have to fixup the clean/dirty bits here... we can
519 	 * allow the pager to do it after the I/O completes.
520 	 *
521 	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
522 	 * edge case with file fragments.
523 	 */
524 	for (i = 0; i < count; i++) {
525 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
526 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
527 			mc[i], i, count));
528 		vm_page_sbusy(mc[i]);
529 		pmap_remove_write(mc[i]);
530 	}
531 	vm_object_pip_add(object, count);
532 
533 	vm_pager_put_pages(object, mc, count, flags, pageout_status);
534 
535 	runlen = count - mreq;
536 	if (eio != NULL)
537 		*eio = FALSE;
538 	for (i = 0; i < count; i++) {
539 		vm_page_t mt = mc[i];
540 
541 		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
542 		    !pmap_page_is_write_mapped(mt),
543 		    ("vm_pageout_flush: page %p is not write protected", mt));
544 		switch (pageout_status[i]) {
545 		case VM_PAGER_OK:
546 		case VM_PAGER_PEND:
547 			numpagedout++;
548 			break;
549 		case VM_PAGER_BAD:
550 			/*
551 			 * Page outside of range of object. Right now we
552 			 * essentially lose the changes by pretending it
553 			 * worked.
554 			 */
555 			vm_page_undirty(mt);
556 			break;
557 		case VM_PAGER_ERROR:
558 		case VM_PAGER_FAIL:
559 			/*
560 			 * If the page couldn't be paged out, then reactivate it
561 			 * so it doesn't clog the inactive list.  (We will try
562 			 * paging it out again later.)
563 			 */
564 			vm_page_lock(mt);
565 			vm_page_activate(mt);
566 			vm_page_unlock(mt);
567 			if (eio != NULL && i >= mreq && i - mreq < runlen)
568 				*eio = TRUE;
569 			break;
570 		case VM_PAGER_AGAIN:
571 			if (i >= mreq && i - mreq < runlen)
572 				runlen = i - mreq;
573 			break;
574 		}
575 
576 		/*
577 		 * If the operation is still going, leave the page busy to
578 		 * block all other accesses. Also, leave the paging in
579 		 * progress indicator set so that we don't attempt an object
580 		 * collapse.
581 		 */
582 		if (pageout_status[i] != VM_PAGER_PEND) {
583 			vm_object_pip_wakeup(object);
584 			vm_page_sunbusy(mt);
585 		}
586 	}
587 	if (prunlen != NULL)
588 		*prunlen = runlen;
589 	return (numpagedout);
590 }
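/*
 * Worked example of the runlen/eio accounting above: with count = 8 and
 * mreq = 2, runlen starts at 8 - 2 = 6.  If the first VM_PAGER_AGAIN at or
 * after mreq occurs at i = 5, runlen is clipped to 5 - 2 = 3, so *prunlen
 * reports the three pages starting at mreq that were handled before the
 * first "try again" page.  *eio is only set for ERROR/FAIL results that
 * fall inside that clipped window.
 */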
591 
592 static boolean_t
593 vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
594     vm_paddr_t high)
595 {
596 	struct mount *mp;
597 	struct vnode *vp;
598 	vm_object_t object;
599 	vm_paddr_t pa;
600 	vm_page_t m, m_tmp, next;
601 	int lockmode;
602 
603 	vm_pagequeue_lock(pq);
604 	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
605 		if ((m->flags & PG_MARKER) != 0)
606 			continue;
607 		pa = VM_PAGE_TO_PHYS(m);
608 		if (pa < low || pa + PAGE_SIZE > high)
609 			continue;
610 		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
611 			vm_page_unlock(m);
612 			continue;
613 		}
614 		object = m->object;
615 		if ((!VM_OBJECT_TRYWLOCK(object) &&
616 		    (!vm_pageout_fallback_object_lock(m, &next) ||
617 		    m->hold_count != 0)) || vm_page_busied(m)) {
618 			vm_page_unlock(m);
619 			VM_OBJECT_WUNLOCK(object);
620 			continue;
621 		}
622 		vm_page_test_dirty(m);
623 		if (m->dirty == 0 && object->ref_count != 0)
624 			pmap_remove_all(m);
625 		if (m->dirty != 0) {
626 			vm_page_unlock(m);
627 			if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
628 				VM_OBJECT_WUNLOCK(object);
629 				continue;
630 			}
631 			if (object->type == OBJT_VNODE) {
632 				vm_pagequeue_unlock(pq);
633 				vp = object->handle;
634 				vm_object_reference_locked(object);
635 				VM_OBJECT_WUNLOCK(object);
636 				(void)vn_start_write(vp, &mp, V_WAIT);
637 				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
638 				    LK_SHARED : LK_EXCLUSIVE;
639 				vn_lock(vp, lockmode | LK_RETRY);
640 				VM_OBJECT_WLOCK(object);
641 				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
642 				VM_OBJECT_WUNLOCK(object);
643 				VOP_UNLOCK(vp, 0);
644 				vm_object_deallocate(object);
645 				vn_finished_write(mp);
646 				return (TRUE);
647 			} else if (object->type == OBJT_SWAP ||
648 			    object->type == OBJT_DEFAULT) {
649 				vm_pagequeue_unlock(pq);
650 				m_tmp = m;
651 				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
652 				    0, NULL, NULL);
653 				VM_OBJECT_WUNLOCK(object);
654 				return (TRUE);
655 			}
656 		} else {
657 			/*
658 			 * Dequeue here to prevent lock recursion in
659 			 * vm_page_cache().
660 			 */
661 			vm_page_dequeue_locked(m);
662 			vm_page_cache(m);
663 			vm_page_unlock(m);
664 		}
665 		VM_OBJECT_WUNLOCK(object);
666 	}
667 	vm_pagequeue_unlock(pq);
668 	return (FALSE);
669 }
670 
671 /*
672  * Increase the number of cached pages.  The specified value, "tries",
673  * determines which categories of pages are cached:
674  *
675  *  0: All clean, inactive pages within the specified physical address range
676  *     are cached.  Will not sleep.
677  *  1: The vm_lowmem handlers are called.  All inactive pages within
678  *     the specified physical address range are cached.  May sleep.
679  *  2: The vm_lowmem handlers are called.  All inactive and active pages
680  *     within the specified physical address range are cached.  May sleep.
681  */
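/*
 * Illustrative usage sketch (hypothetical caller, not taken from this file):
 * a contiguous-allocation path would typically escalate through the "tries"
 * levels until the allocation succeeds, e.g.
 *
 *	tries = 0;
 *	while ((m = vm_page_alloc_contig(...)) == NULL && tries < 3)
 *		vm_pageout_grow_cache(tries++, low, high);
 *
 * Levels 1 and 2 may sleep, so they must not be reached from contexts that
 * cannot sleep.
 */
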
682 void
683 vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
684 {
685 	int actl, actmax, inactl, inactmax, dom, initial_dom;
686 	static int start_dom = 0;
687 
688 	if (tries > 0) {
689 		/*
690 		 * Decrease registered cache sizes.  The vm_lowmem handlers
691 		 * may acquire locks and/or sleep, so they can only be invoked
692 		 * when "tries" is greater than zero.
693 		 */
694 		SDT_PROBE0(vm, , , vm__lowmem_cache);
695 		EVENTHANDLER_INVOKE(vm_lowmem, 0);
696 
697 		/*
698 		 * We do this explicitly after the caches have been drained
699 		 * above.
700 		 */
701 		uma_reclaim();
702 	}
703 
704 	/*
705 	 * Make the next scan start on the next domain.
706 	 */
707 	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
708 
709 	inactl = 0;
710 	inactmax = vm_cnt.v_inactive_count;
711 	actl = 0;
712 	actmax = tries < 2 ? 0 : vm_cnt.v_active_count;
713 	dom = initial_dom;
714 
715 	/*
716 	 * Scan domains in round-robin order, first inactive queues,
717 	 * then active.  Since a domain usually owns a large, physically
718 	 * contiguous chunk of memory, it makes sense to completely
719 	 * exhaust one domain before switching to the next, while growing
720 	 * the pool of contiguous physical pages.
721 	 *
722 	 * Do not even start laundering a domain that cannot contain
723 	 * the specified address range, as indicated by the segments
724 	 * constituting the domain.
725 	 */
726 again:
727 	if (inactl < inactmax) {
728 		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
729 		    low, high) &&
730 		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
731 		    tries, low, high)) {
732 			inactl++;
733 			goto again;
734 		}
735 		if (++dom == vm_ndomains)
736 			dom = 0;
737 		if (dom != initial_dom)
738 			goto again;
739 	}
740 	if (actl < actmax) {
741 		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
742 		    low, high) &&
743 		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
744 		      tries, low, high)) {
745 			actl++;
746 			goto again;
747 		}
748 		if (++dom == vm_ndomains)
749 			dom = 0;
750 		if (dom != initial_dom)
751 			goto again;
752 	}
753 }
754 
755 #if !defined(NO_SWAPPING)
756 /*
757  *	vm_pageout_object_deactivate_pages
758  *
759  *	Deactivate enough pages to satisfy the inactive target
760  *	requirements.
761  *
762  *	The object and map must be locked.
763  */
764 static void
765 vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
766     long desired)
767 {
768 	vm_object_t backing_object, object;
769 	vm_page_t p;
770 	int act_delta, remove_mode;
771 
772 	VM_OBJECT_ASSERT_LOCKED(first_object);
773 	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
774 		return;
775 	for (object = first_object;; object = backing_object) {
776 		if (pmap_resident_count(pmap) <= desired)
777 			goto unlock_return;
778 		VM_OBJECT_ASSERT_LOCKED(object);
779 		if ((object->flags & OBJ_UNMANAGED) != 0 ||
780 		    object->paging_in_progress != 0)
781 			goto unlock_return;
782 
783 		remove_mode = 0;
784 		if (object->shadow_count > 1)
785 			remove_mode = 1;
786 		/*
787 		 * Scan the object's entire memory queue.
788 		 */
789 		TAILQ_FOREACH(p, &object->memq, listq) {
790 			if (pmap_resident_count(pmap) <= desired)
791 				goto unlock_return;
792 			if (vm_page_busied(p))
793 				continue;
794 			PCPU_INC(cnt.v_pdpages);
795 			vm_page_lock(p);
796 			if (p->wire_count != 0 || p->hold_count != 0 ||
797 			    !pmap_page_exists_quick(pmap, p)) {
798 				vm_page_unlock(p);
799 				continue;
800 			}
801 			act_delta = pmap_ts_referenced(p);
802 			if ((p->aflags & PGA_REFERENCED) != 0) {
803 				if (act_delta == 0)
804 					act_delta = 1;
805 				vm_page_aflag_clear(p, PGA_REFERENCED);
806 			}
807 			if (p->queue != PQ_ACTIVE && act_delta != 0) {
808 				vm_page_activate(p);
809 				p->act_count += act_delta;
810 			} else if (p->queue == PQ_ACTIVE) {
811 				if (act_delta == 0) {
812 					p->act_count -= min(p->act_count,
813 					    ACT_DECLINE);
814 					if (!remove_mode && p->act_count == 0) {
815 						pmap_remove_all(p);
816 						vm_page_deactivate(p);
817 					} else
818 						vm_page_requeue(p);
819 				} else {
820 					vm_page_activate(p);
821 					if (p->act_count < ACT_MAX -
822 					    ACT_ADVANCE)
823 						p->act_count += ACT_ADVANCE;
824 					vm_page_requeue(p);
825 				}
826 			} else if (p->queue == PQ_INACTIVE)
827 				pmap_remove_all(p);
828 			vm_page_unlock(p);
829 		}
830 		if ((backing_object = object->backing_object) == NULL)
831 			goto unlock_return;
832 		VM_OBJECT_RLOCK(backing_object);
833 		if (object != first_object)
834 			VM_OBJECT_RUNLOCK(object);
835 	}
836 unlock_return:
837 	if (object != first_object)
838 		VM_OBJECT_RUNLOCK(object);
839 }
840 
841 /*
842  * Deactivate some number of pages in a map; try to do it fairly, but
843  * that is really hard to do.
844  */
845 static void
846 vm_pageout_map_deactivate_pages(vm_map_t map, long desired)
849 {
850 	vm_map_entry_t tmpe;
851 	vm_object_t obj, bigobj;
852 	int nothingwired;
853 
854 	if (!vm_map_trylock(map))
855 		return;
856 
857 	bigobj = NULL;
858 	nothingwired = TRUE;
859 
860 	/*
861 	 * first, search out the biggest object, and try to free pages from
862 	 * that.
863 	 */
864 	tmpe = map->header.next;
865 	while (tmpe != &map->header) {
866 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
867 			obj = tmpe->object.vm_object;
868 			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
869 				if (obj->shadow_count <= 1 &&
870 				    (bigobj == NULL ||
871 				     bigobj->resident_page_count < obj->resident_page_count)) {
872 					if (bigobj != NULL)
873 						VM_OBJECT_RUNLOCK(bigobj);
874 					bigobj = obj;
875 				} else
876 					VM_OBJECT_RUNLOCK(obj);
877 			}
878 		}
879 		if (tmpe->wired_count > 0)
880 			nothingwired = FALSE;
881 		tmpe = tmpe->next;
882 	}
883 
884 	if (bigobj != NULL) {
885 		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
886 		VM_OBJECT_RUNLOCK(bigobj);
887 	}
888 	/*
889 	 * Next, hunt around for other pages to deactivate.  We actually
890 	 * do this search sort of wrong -- .text first is not the best idea.
891 	 */
892 	tmpe = map->header.next;
893 	while (tmpe != &map->header) {
894 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
895 			break;
896 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
897 			obj = tmpe->object.vm_object;
898 			if (obj != NULL) {
899 				VM_OBJECT_RLOCK(obj);
900 				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
901 				VM_OBJECT_RUNLOCK(obj);
902 			}
903 		}
904 		tmpe = tmpe->next;
905 	}
906 
907 	/*
908 	 * Remove all mappings if a process is swapped out; this will free page
909 	 * table pages.
910 	 */
911 	if (desired == 0 && nothingwired) {
912 		pmap_remove(vm_map_pmap(map), vm_map_min(map),
913 		    vm_map_max(map));
914 	}
915 
916 	vm_map_unlock(map);
917 }
918 #endif		/* !defined(NO_SWAPPING) */
919 
920 /*
921  * Attempt to acquire all of the necessary locks to launder a page and
922  * then call through the clustering layer to PUTPAGES.  Wait a short
923  * time for a vnode lock.
924  *
925  * Requires the page and object lock on entry, releases both before return.
926  * Returns 0 on success and an errno otherwise.
927  */
928 static int
929 vm_pageout_clean(vm_page_t m)
930 {
931 	struct vnode *vp;
932 	struct mount *mp;
933 	vm_object_t object;
934 	vm_pindex_t pindex;
935 	int error, lockmode;
936 
937 	vm_page_assert_locked(m);
938 	object = m->object;
939 	VM_OBJECT_ASSERT_WLOCKED(object);
940 	error = 0;
941 	vp = NULL;
942 	mp = NULL;
943 
944 	/*
945 	 * The object is already known NOT to be dead.   It
946 	 * is possible for the vget() to block the whole
947 	 * pageout daemon, but the new low-memory handling
948 	 * code should prevent it.
949 	 *
950 	 * We can't wait forever for the vnode lock; we might
951 	 * deadlock due to a vn_read() getting stuck in
952 	 * vm_wait while holding this vnode.  We skip the
953 	 * vnode if we can't get it in a reasonable amount
954 	 * of time.
955 	 */
956 	if (object->type == OBJT_VNODE) {
957 		vm_page_unlock(m);
958 		vp = object->handle;
959 		if (vp->v_type == VREG &&
960 		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
961 			mp = NULL;
962 			error = EDEADLK;
963 			goto unlock_all;
964 		}
965 		KASSERT(mp != NULL,
966 		    ("vp %p with NULL v_mount", vp));
967 		vm_object_reference_locked(object);
968 		pindex = m->pindex;
969 		VM_OBJECT_WUNLOCK(object);
970 		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
971 		    LK_SHARED : LK_EXCLUSIVE;
972 		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
973 			vp = NULL;
974 			error = EDEADLK;
975 			goto unlock_mp;
976 		}
977 		VM_OBJECT_WLOCK(object);
978 		vm_page_lock(m);
979 		/*
980 		 * While the object and page were unlocked, the page
981 		 * may have been:
982 		 * (1) moved to a different queue,
983 		 * (2) reallocated to a different object,
984 		 * (3) reallocated to a different offset, or
985 		 * (4) cleaned.
986 		 */
987 		if (m->queue != PQ_INACTIVE || m->object != object ||
988 		    m->pindex != pindex || m->dirty == 0) {
989 			vm_page_unlock(m);
990 			error = ENXIO;
991 			goto unlock_all;
992 		}
993 
994 		/*
995 		 * The page may have been busied or held while the object
996 		 * and page locks were released.
997 		 */
998 		if (vm_page_busied(m) || m->hold_count != 0) {
999 			vm_page_unlock(m);
1000 			error = EBUSY;
1001 			goto unlock_all;
1002 		}
1003 	}
1004 
1005 	/*
1006 	 * If a page is dirty, then it is either being washed
1007 	 * (but not yet cleaned) or it is still in the
1008 	 * laundry.  If it is still in the laundry, then we
1009 	 * start the cleaning operation.
1010 	 */
1011 	if (vm_pageout_cluster(m) == 0)
1012 		error = EIO;
1013 
1014 unlock_all:
1015 	VM_OBJECT_WUNLOCK(object);
1016 
1017 unlock_mp:
1018 	vm_page_lock_assert(m, MA_NOTOWNED);
1019 	if (mp != NULL) {
1020 		if (vp != NULL)
1021 			vput(vp);
1022 		vm_object_deallocate(object);
1023 		vn_finished_write(mp);
1024 	}
1025 
1026 	return (error);
1027 }
1028 
1029 /*
1030  *	vm_pageout_scan does the dirty work for the pageout daemon.
1031  *
1032  *	pass 0 - Update active LRU/deactivate pages
1033  *	pass 1 - Move inactive to cache or free
1034  *	pass 2 - Launder dirty pages
1035  */
1036 static void
1037 vm_pageout_scan(struct vm_domain *vmd, int pass)
1038 {
1039 	vm_page_t m, next;
1040 	struct vm_pagequeue *pq;
1041 	vm_object_t object;
1042 	long min_scan;
1043 	int act_delta, addl_page_shortage, deficit, error, maxlaunder, maxscan;
1044 	int page_shortage, scan_tick, scanned, vnodes_skipped;
1045 	boolean_t pageout_ok, queues_locked;
1046 
1047 	/*
1048 	 * If we need to reclaim memory ask kernel caches to return
1049 	 * some.  We rate limit to avoid thrashing.
1050 	 */
1051 	if (vmd == &vm_dom[0] && pass > 0 &&
1052 	    (time_uptime - lowmem_uptime) >= lowmem_period) {
1053 		/*
1054 		 * Decrease registered cache sizes.
1055 		 */
1056 		SDT_PROBE0(vm, , , vm__lowmem_scan);
1057 		EVENTHANDLER_INVOKE(vm_lowmem, 0);
1058 		/*
1059 		 * We do this explicitly after the caches have been
1060 		 * drained above.
1061 		 */
1062 		uma_reclaim();
1063 		lowmem_uptime = time_uptime;
1064 	}
1065 
1066 	/*
1067 	 * The addl_page_shortage is the number of temporarily
1068 	 * stuck pages in the inactive queue.  In other words, the
1069 	 * number of pages from the inactive count that should be
1070 	 * discounted in setting the target for the active queue scan.
1071 	 */
1072 	addl_page_shortage = 0;
1073 
1074 	/*
1075 	 * Calculate the number of pages we want to either free or move
1076 	 * to the cache.
1077 	 */
1078 	if (pass > 0) {
1079 		deficit = atomic_readandclear_int(&vm_pageout_deficit);
1080 		page_shortage = vm_paging_target() + deficit;
1081 	} else
1082 		page_shortage = deficit = 0;
1083 
1084 	/*
1085 	 * maxlaunder limits the number of dirty pages we flush per scan.
1086 	 * For most systems a smaller value (16 or 32) is more robust under
1087 	 * extreme memory and disk pressure because any unnecessary writes
1088 	 * to disk can result in extreme performance degradation.  However,
1089 	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
1090 	 * used) will die horribly with limited laundering.  If the pageout
1091 	 * daemon cannot clean enough pages in the first pass, we let it go
1092 	 * all out in succeeding passes.
1093 	 */
1094 	if ((maxlaunder = vm_max_launder) <= 1)
1095 		maxlaunder = 1;
1096 	if (pass > 1)
1097 		maxlaunder = 10000;
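	/*
	 * For example, with the default vm_max_launder of 32, a pass-1 scan
	 * pushes at most 32 dirty pages to their pagers; from pass 2 onward
	 * the limit of 10000 is effectively unbounded for practical queue
	 * sizes.
	 */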
1098 
1099 	vnodes_skipped = 0;
1100 
1101 	/*
1102 	 * Start scanning the inactive queue for pages we can move to the
1103 	 * cache or free.  The scan will stop when the target is reached or
1104 	 * we have scanned the entire inactive queue.  Note that m->act_count
1105 	 * is not used to form decisions for the inactive queue, only for the
1106 	 * active queue.
1107 	 */
1108 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
1109 	maxscan = pq->pq_cnt;
1110 	vm_pagequeue_lock(pq);
1111 	queues_locked = TRUE;
1112 	for (m = TAILQ_FIRST(&pq->pq_pl);
1113 	     m != NULL && maxscan-- > 0 && page_shortage > 0;
1114 	     m = next) {
1115 		vm_pagequeue_assert_locked(pq);
1116 		KASSERT(queues_locked, ("unlocked queues"));
1117 		KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));
1118 
1119 		PCPU_INC(cnt.v_pdpages);
1120 		next = TAILQ_NEXT(m, plinks.q);
1121 
1122 		/*
1123 		 * skip marker pages
1124 		 */
1125 		if (m->flags & PG_MARKER)
1126 			continue;
1127 
1128 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
1129 		    ("Fictitious page %p cannot be in inactive queue", m));
1130 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1131 		    ("Unmanaged page %p cannot be in inactive queue", m));
1132 
1133 		/*
1134 		 * The page or object lock acquisitions fail if the
1135 		 * page was removed from the queue or moved to a
1136 		 * different position within the queue.  In either
1137 		 * case, addl_page_shortage should not be incremented.
1138 		 */
1139 		if (!vm_pageout_page_lock(m, &next))
1140 			goto unlock_page;
1141 		else if (m->hold_count != 0) {
1142 			/*
1143 			 * Held pages are essentially stuck in the
1144 			 * queue.  So, they ought to be discounted
1145 			 * from the inactive count.  See the
1146 			 * calculation of the page_shortage for the
1147 			 * loop over the active queue below.
1148 			 */
1149 			addl_page_shortage++;
1150 			goto unlock_page;
1151 		}
1152 		object = m->object;
1153 		if (!VM_OBJECT_TRYWLOCK(object)) {
1154 			if (!vm_pageout_fallback_object_lock(m, &next))
1155 				goto unlock_object;
1156 			else if (m->hold_count != 0) {
1157 				addl_page_shortage++;
1158 				goto unlock_object;
1159 			}
1160 		}
1161 		if (vm_page_busied(m)) {
1162 			/*
1163 			 * Don't mess with busy pages.  Leave them at
1164 			 * the front of the queue.  Most likely, they
1165 			 * are being paged out and will leave the
1166 			 * queue shortly after the scan finishes.  So,
1167 			 * they ought to be discounted from the
1168 			 * inactive count.
1169 			 */
1170 			addl_page_shortage++;
1171 unlock_object:
1172 			VM_OBJECT_WUNLOCK(object);
1173 unlock_page:
1174 			vm_page_unlock(m);
1175 			continue;
1176 		}
1177 		KASSERT(m->hold_count == 0, ("Held page %p", m));
1178 
1179 		/*
1180 		 * We unlock the inactive page queue, invalidating the
1181 		 * 'next' pointer.  Use our marker to remember our
1182 		 * place.
1183 		 */
1184 		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
1185 		vm_pagequeue_unlock(pq);
1186 		queues_locked = FALSE;
1187 
1188 		/*
1189 		 * Invalid pages can be easily freed. They cannot be
1190 		 * mapped; vm_page_free() asserts this.
1191 		 */
1192 		if (m->valid == 0)
1193 			goto free_page;
1194 
1195 		/*
1196 		 * If the page has been referenced and the object is not dead,
1197 		 * reactivate or requeue the page depending on whether the
1198 		 * object is mapped.
1199 		 */
1200 		if ((m->aflags & PGA_REFERENCED) != 0) {
1201 			vm_page_aflag_clear(m, PGA_REFERENCED);
1202 			act_delta = 1;
1203 		} else
1204 			act_delta = 0;
1205 		if (object->ref_count != 0) {
1206 			act_delta += pmap_ts_referenced(m);
1207 		} else {
1208 			KASSERT(!pmap_page_is_mapped(m),
1209 			    ("vm_pageout_scan: page %p is mapped", m));
1210 		}
1211 		if (act_delta != 0) {
1212 			if (object->ref_count != 0) {
1213 				vm_page_activate(m);
1214 
1215 				/*
1216 				 * Increase the activation count if the page
1217 				 * was referenced while in the inactive queue.
1218 				 * This makes it less likely that the page will
1219 				 * be returned prematurely to the inactive
1220 				 * queue.
1221  				 */
1222 				m->act_count += act_delta + ACT_ADVANCE;
1223 				goto drop_page;
1224 			} else if ((object->flags & OBJ_DEAD) == 0)
1225 				goto requeue_page;
1226 		}
1227 
1228 		/*
1229 		 * If the page appears to be clean at the machine-independent
1230 		 * layer, then remove all of its mappings from the pmap in
1231 		 * anticipation of placing it onto the cache queue.  If,
1232 		 * however, any of the page's mappings allow write access,
1233 		 * then the page may still be modified until the last of those
1234 		 * mappings are removed.
1235 		 */
1236 		if (object->ref_count != 0) {
1237 			vm_page_test_dirty(m);
1238 			if (m->dirty == 0)
1239 				pmap_remove_all(m);
1240 		}
1241 
1242 		if (m->dirty == 0) {
1243 			/*
1244 			 * Clean pages can be freed.
1245 			 */
1246 free_page:
1247 			vm_page_free(m);
1248 			PCPU_INC(cnt.v_dfree);
1249 			--page_shortage;
1250 		} else if ((object->flags & OBJ_DEAD) != 0) {
1251 			/*
1252 			 * Leave dirty pages from dead objects at the front of
1253 			 * the queue.  They are being paged out and freed by
1254 			 * the thread that destroyed the object.  They will
1255 			 * leave the queue shortly after the scan finishes, so
1256 			 * they should be discounted from the inactive count.
1257 			 */
1258 			addl_page_shortage++;
1259 		} else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
1260 			/*
1261 			 * Dirty pages need to be paged out, but flushing
1262 			 * a page is extremely expensive versus freeing
1263 			 * a clean page.  Rather than artificially limiting
1264 			 * the number of pages we can flush, we instead give
1265 			 * dirty pages extra priority on the inactive queue
1266 			 * by forcing them to be cycled through the queue
1267 			 * twice before being flushed, after which the
1268 			 * (now clean) page will cycle through once more
1269 			 * before being freed.  This significantly extends
1270 			 * the thrash point for a heavily loaded machine.
1271 			 */
1272 			m->flags |= PG_WINATCFLS;
1273 requeue_page:
1274 			vm_pagequeue_lock(pq);
1275 			queues_locked = TRUE;
1276 			vm_page_requeue_locked(m);
1277 		} else if (maxlaunder > 0) {
1278 			/*
1279 			 * We always want to try to flush some dirty pages if
1280 			 * we encounter them, to keep the system stable.
1281 			 * Normally this number is small, but under extreme
1282 			 * pressure where there are insufficient clean pages
1283 			 * on the inactive queue, we may have to go all out.
1284 			 */
1285 
1286 			if (object->type != OBJT_SWAP &&
1287 			    object->type != OBJT_DEFAULT)
1288 				pageout_ok = TRUE;
1289 			else if (disable_swap_pageouts)
1290 				pageout_ok = FALSE;
1291 			else if (defer_swap_pageouts)
1292 				pageout_ok = vm_page_count_min();
1293 			else
1294 				pageout_ok = TRUE;
1295 			if (!pageout_ok)
1296 				goto requeue_page;
1297 			error = vm_pageout_clean(m);
1298 			/*
1299 			 * Decrement page_shortage on success to account for
1300 			 * the (future) cleaned page.  Otherwise we could wind
1301 			 * up laundering or cleaning too many pages.
1302 			 */
1303 			if (error == 0) {
1304 				page_shortage--;
1305 				maxlaunder--;
1306 			} else if (error == EDEADLK) {
1307 				pageout_lock_miss++;
1308 				vnodes_skipped++;
1309 			} else if (error == EBUSY) {
1310 				addl_page_shortage++;
1311 			}
1312 			vm_page_lock_assert(m, MA_NOTOWNED);
1313 			goto relock_queues;
1314 		}
1315 drop_page:
1316 		vm_page_unlock(m);
1317 		VM_OBJECT_WUNLOCK(object);
1318 relock_queues:
1319 		if (!queues_locked) {
1320 			vm_pagequeue_lock(pq);
1321 			queues_locked = TRUE;
1322 		}
1323 		next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
1324 		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
1325 	}
1326 	vm_pagequeue_unlock(pq);
1327 
1328 #if !defined(NO_SWAPPING)
1329 	/*
1330 	 * Wakeup the swapout daemon if we didn't cache or free the targeted
1331 	 * number of pages.
1332 	 */
1333 	if (vm_swap_enabled && page_shortage > 0)
1334 		vm_req_vmdaemon(VM_SWAP_NORMAL);
1335 #endif
1336 
1337 	/*
1338 	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
1339 	 * and we didn't cache or free enough pages.
1340 	 */
1341 	if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target -
1342 	    vm_cnt.v_free_min)
1343 		(void)speedup_syncer();
1344 
1345 	/*
1346 	 * Compute the number of pages we want to try to move from the
1347 	 * active queue to the inactive queue.
1348 	 */
1349 	page_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count +
1350 	    vm_paging_target() + deficit + addl_page_shortage;
1351 
1352 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
1353 	vm_pagequeue_lock(pq);
1354 	maxscan = pq->pq_cnt;
1355 
1356 	/*
1357 	 * If we're just idle polling, attempt to visit every
1358 	 * active page within 'update_period' seconds.
1359 	 */
1360 	scan_tick = ticks;
1361 	if (vm_pageout_update_period != 0) {
1362 		min_scan = pq->pq_cnt;
1363 		min_scan *= scan_tick - vmd->vmd_last_active_scan;
1364 		min_scan /= hz * vm_pageout_update_period;
1365 	} else
1366 		min_scan = 0;
1367 	if (min_scan > 0 || (page_shortage > 0 && maxscan > 0))
1368 		vmd->vmd_last_active_scan = scan_tick;
1369 
1370 	/*
1371 	 * Scan the active queue for pages that can be deactivated.  Update
1372 	 * the per-page activity counter and use it to identify deactivation
1373 	 * candidates.
1374 	 */
1375 	for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
1376 	    min_scan || (page_shortage > 0 && scanned < maxscan)); m = next,
1377 	    scanned++) {
1378 
1379 		KASSERT(m->queue == PQ_ACTIVE,
1380 		    ("vm_pageout_scan: page %p isn't active", m));
1381 
1382 		next = TAILQ_NEXT(m, plinks.q);
1383 		if ((m->flags & PG_MARKER) != 0)
1384 			continue;
1385 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
1386 		    ("Fictitious page %p cannot be in active queue", m));
1387 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1388 		    ("Unmanaged page %p cannot be in active queue", m));
1389 		if (!vm_pageout_page_lock(m, &next)) {
1390 			vm_page_unlock(m);
1391 			continue;
1392 		}
1393 
1394 		/*
1395 		 * The count for pagedaemon pages is done after checking the
1396 		 * page for eligibility...
1397 		 */
1398 		PCPU_INC(cnt.v_pdpages);
1399 
1400 		/*
1401 		 * Check to see "how much" the page has been used.
1402 		 */
1403 		if ((m->aflags & PGA_REFERENCED) != 0) {
1404 			vm_page_aflag_clear(m, PGA_REFERENCED);
1405 			act_delta = 1;
1406 		} else
1407 			act_delta = 0;
1408 
1409 		/*
1410 		 * Unlocked object ref count check.  Two races are possible.
1411 		 * 1) The ref was transitioning to zero and we saw non-zero,
1412 		 *    the pmap bits will be checked unnecessarily.
1413 		 * 2) The ref was transitioning to one and we saw zero.
1414 		 *    The page lock prevents a new reference to this page so
1415 		 *    we need not check the reference bits.
1416 		 */
1417 		if (m->object->ref_count != 0)
1418 			act_delta += pmap_ts_referenced(m);
1419 
1420 		/*
1421 		 * Advance or decay the act_count based on recent usage.
1422 		 */
1423 		if (act_delta != 0) {
1424 			m->act_count += ACT_ADVANCE + act_delta;
1425 			if (m->act_count > ACT_MAX)
1426 				m->act_count = ACT_MAX;
1427 		} else
1428 			m->act_count -= min(m->act_count, ACT_DECLINE);
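		/*
		 * Illustrative numbers, assuming the ACT_* defaults from
		 * vm_page.h (ACT_ADVANCE 3, ACT_DECLINE 1, ACT_MAX 64): a
		 * page referenced on every scan climbs by at least 4 per
		 * visit until it saturates at 64, while an idle page decays
		 * by 1 per visit, so a page that reached ACT_MAX is not
		 * deactivated until roughly 64 consecutive unreferenced
		 * scans have passed.
		 */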
1429 
1430 		/*
1431 		 * Move this page to the tail of the active or inactive
1432 		 * queue depending on usage.
1433 		 */
1434 		if (m->act_count == 0) {
1435 			/* Dequeue to avoid later lock recursion. */
1436 			vm_page_dequeue_locked(m);
1437 			vm_page_deactivate(m);
1438 			page_shortage--;
1439 		} else
1440 			vm_page_requeue_locked(m);
1441 		vm_page_unlock(m);
1442 	}
1443 	vm_pagequeue_unlock(pq);
1444 #if !defined(NO_SWAPPING)
1445 	/*
1446 	 * Idle process swapout -- run once per second.
1447 	 */
1448 	if (vm_swap_idle_enabled) {
1449 		static long lsec;
1450 		if (time_second != lsec) {
1451 			vm_req_vmdaemon(VM_SWAP_IDLE);
1452 			lsec = time_second;
1453 		}
1454 	}
1455 #endif
1456 
1457 	/*
1458 	 * If we are critically low on one of RAM or swap and low on
1459 	 * the other, kill the largest process.  However, we avoid
1460 	 * doing this on the first pass in order to give ourselves a
1461 	 * chance to flush out dirty vnode-backed pages and to allow
1462 	 * active pages to be moved to the inactive queue and reclaimed.
1463 	 */
1464 	vm_pageout_mightbe_oom(vmd, pass);
1465 }
1466 
1467 static int vm_pageout_oom_vote;
1468 
1469 /*
1470  * The pagedaemon threads randomly select one to perform the
1471  * OOM.  Trying to kill processes before all pagedaemons have
1472  * failed to reach the free target is premature.
1473  */
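/*
 * Concrete illustration of the vote: with vm_ndomains == 4, the first three
 * pagedaemon threads to reach shortfall merely set vmd_oom and bump
 * vm_pageout_oom_vote; only the fourth voter (old_vote == 3) sees the full
 * quorum and calls vm_pageout_oom().  Its vote is withdrawn immediately
 * afterwards, so a persistent shortage must re-establish the quorum before
 * another kill.
 */
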
1474 static void
1475 vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
1476 {
1477 	int old_vote;
1478 
1479 	if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
1480 	    (swap_pager_full && vm_paging_target() > 0))) {
1481 		if (vmd->vmd_oom) {
1482 			vmd->vmd_oom = FALSE;
1483 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
1484 		}
1485 		return;
1486 	}
1487 
1488 	if (vmd->vmd_oom)
1489 		return;
1490 
1491 	vmd->vmd_oom = TRUE;
1492 	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
1493 	if (old_vote != vm_ndomains - 1)
1494 		return;
1495 
1496 	/*
1497 	 * The current pagedaemon thread is the last in the quorum to
1498 	 * start OOM.  Initiate the selection and signaling of the
1499 	 * victim.
1500 	 */
1501 	vm_pageout_oom(VM_OOM_MEM);
1502 
1503 	/*
1504 	 * After one round of OOM terror, recall our vote.  On the
1505 	 * next pass, the current pagedaemon will vote again if the low
1506 	 * memory condition is still there, since vmd_oom is now
1507 	 * false.
1508 	 */
1509 	vmd->vmd_oom = FALSE;
1510 	atomic_subtract_int(&vm_pageout_oom_vote, 1);
1511 }
1512 
1513 void
1514 vm_pageout_oom(int shortage)
1515 {
1516 	struct proc *p, *bigproc;
1517 	vm_offset_t size, bigsize;
1518 	struct thread *td;
1519 	struct vmspace *vm;
1520 
1521 	/*
1522 	 * We keep the process bigproc locked once we find it to keep anyone
1523 	 * from messing with it; however, there is a possibility of
1524 	 * deadlock if process B is bigproc and one of its child processes
1525 	 * attempts to propagate a signal to B while we are waiting for A's
1526 	 * lock while walking this list.  To avoid this, we don't block on
1527 	 * the process lock but just skip a process if it is already locked.
1528 	 */
1529 	bigproc = NULL;
1530 	bigsize = 0;
1531 	sx_slock(&allproc_lock);
1532 	FOREACH_PROC_IN_SYSTEM(p) {
1533 		int breakout;
1534 
1535 		PROC_LOCK(p);
1536 
1537 		/*
1538 		 * If this is a system, protected or killed process, skip it.
1539 		 */
1540 		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
1541 		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
1542 		    p->p_pid == 1 || P_KILLED(p) ||
1543 		    (p->p_pid < 48 && swap_pager_avail != 0)) {
1544 			PROC_UNLOCK(p);
1545 			continue;
1546 		}
1547 		/*
1548 		 * If the process is in a non-running type state,
1549 		 * don't touch it.  Check all the threads individually.
1550 		 */
1551 		breakout = 0;
1552 		FOREACH_THREAD_IN_PROC(p, td) {
1553 			thread_lock(td);
1554 			if (!TD_ON_RUNQ(td) &&
1555 			    !TD_IS_RUNNING(td) &&
1556 			    !TD_IS_SLEEPING(td) &&
1557 			    !TD_IS_SUSPENDED(td)) {
1558 				thread_unlock(td);
1559 				breakout = 1;
1560 				break;
1561 			}
1562 			thread_unlock(td);
1563 		}
1564 		if (breakout) {
1565 			PROC_UNLOCK(p);
1566 			continue;
1567 		}
1568 		/*
1569 		 * get the process size
1570 		 */
1571 		vm = vmspace_acquire_ref(p);
1572 		if (vm == NULL) {
1573 			PROC_UNLOCK(p);
1574 			continue;
1575 		}
1576 		_PHOLD(p);
1577 		if (!vm_map_trylock_read(&vm->vm_map)) {
1578 			_PRELE(p);
1579 			PROC_UNLOCK(p);
1580 			vmspace_free(vm);
1581 			continue;
1582 		}
1583 		PROC_UNLOCK(p);
1584 		size = vmspace_swap_count(vm);
1585 		vm_map_unlock_read(&vm->vm_map);
1586 		if (shortage == VM_OOM_MEM)
1587 			size += vmspace_resident_count(vm);
1588 		vmspace_free(vm);
1589 		/*
1590 		 * If this process is bigger than the biggest one,
1591 		 * remember it.
1592 		 */
1593 		if (size > bigsize) {
1594 			if (bigproc != NULL)
1595 				PRELE(bigproc);
1596 			bigproc = p;
1597 			bigsize = size;
1598 		} else {
1599 			PRELE(p);
1600 		}
1601 	}
1602 	sx_sunlock(&allproc_lock);
1603 	if (bigproc != NULL) {
1604 		if (vm_panic_on_oom != 0)
1605 			panic("out of swap space");
1606 		PROC_LOCK(bigproc);
1607 		killproc(bigproc, "out of swap space");
1608 		sched_nice(bigproc, PRIO_MIN);
1609 		_PRELE(bigproc);
1610 		PROC_UNLOCK(bigproc);
1611 		wakeup(&vm_cnt.v_free_count);
1612 	}
1613 }
1614 
1615 static void
1616 vm_pageout_worker(void *arg)
1617 {
1618 	struct vm_domain *domain;
1619 	int domidx;
1620 
1621 	domidx = (uintptr_t)arg;
1622 	domain = &vm_dom[domidx];
1623 
1624 	/*
1625 	 * XXXKIB It could be useful to bind pageout daemon threads to
1626 	 * the cores belonging to the domain, from which vm_page_array
1627 	 * is allocated.
1628 	 */
1629 
1630 	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
1631 	domain->vmd_last_active_scan = ticks;
1632 	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
1633 
1634 	/*
1635 	 * The pageout daemon worker is never done, so loop forever.
1636 	 */
1637 	while (TRUE) {
1638 		/*
1639 		 * If we have enough free memory, wakeup waiters.  Do
1640 		 * not clear vm_pages_needed until we reach our target,
1641 		 * otherwise we may be woken up over and over again and
1642 		 * waste a lot of cpu.
1643 		 */
1644 		mtx_lock(&vm_page_queue_free_mtx);
1645 		if (vm_pages_needed && !vm_page_count_min()) {
1646 			if (!vm_paging_needed())
1647 				vm_pages_needed = 0;
1648 			wakeup(&vm_cnt.v_free_count);
1649 		}
1650 		if (vm_pages_needed) {
1651 			/*
1652 			 * We're still not done.  Either vm_pages_needed was
1653 			 * set by another thread during the previous scan
1654 			 * (typically, this happens during a level 0 scan) or
1655 			 * vm_pages_needed was already set and the scan failed
1656 			 * to free enough pages.  If we haven't yet performed
1657 			 * a level >= 2 scan (unlimited dirty cleaning), then
1658 			 * upgrade the level and scan again now.  Otherwise,
1659 			 * sleep a bit and try again later.  While sleeping,
1660 			 * vm_pages_needed can be cleared.
1661 			 */
1662 			if (domain->vmd_pass > 1)
1663 				msleep(&vm_pages_needed,
1664 				    &vm_page_queue_free_mtx, PVM, "psleep",
1665 				    hz / 2);
1666 		} else {
1667 			/*
1668 			 * Good enough, sleep until required to refresh
1669 			 * stats.
1670 			 */
1671 			msleep(&vm_pages_needed, &vm_page_queue_free_mtx,
1672 			    PVM, "psleep", hz);
1673 		}
1674 		if (vm_pages_needed) {
1675 			vm_cnt.v_pdwakeups++;
1676 			domain->vmd_pass++;
1677 		} else
1678 			domain->vmd_pass = 0;
1679 		mtx_unlock(&vm_page_queue_free_mtx);
1680 		vm_pageout_scan(domain, domain->vmd_pass);
1681 	}
1682 }
1683 
1684 /*
1685  *	vm_pageout_init initialises basic pageout daemon settings.
1686  */
1687 static void
1688 vm_pageout_init(void)
1689 {
1690 	/*
1691 	 * Initialize some paging parameters.
1692 	 */
1693 	vm_cnt.v_interrupt_free_min = 2;
1694 	if (vm_cnt.v_page_count < 2000)
1695 		vm_pageout_page_count = 8;
1696 
1697 	/*
1698 	 * v_free_reserved needs to include enough for the largest
1699 	 * swap pager structures plus enough for any pv_entry structs
1700 	 * when paging.
1701 	 */
1702 	if (vm_cnt.v_page_count > 1024)
1703 		vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
1704 	else
1705 		vm_cnt.v_free_min = 4;
1706 	vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
1707 	    vm_cnt.v_interrupt_free_min;
1708 	vm_cnt.v_free_reserved = vm_pageout_page_count +
1709 	    vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
1710 	vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
1711 	vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
1712 	vm_cnt.v_free_min += vm_cnt.v_free_reserved;
1713 	vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
1714 	vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
1715 	if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
1716 		vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
1717 
1718 	/*
1719 	 * Set the default wakeup threshold to be 10% above the minimum
1720 	 * page limit.  This keeps the steady state out of shortfall.
1721 	 */
1722 	vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
1723 
1724 	/*
1725 	 * Set interval in seconds for active scan.  We want to visit each
1726 	 * page at least once every ten minutes.  This is to prevent worst
1727 	 * case paging behaviors with stale active LRU.
1728 	 */
1729 	if (vm_pageout_update_period == 0)
1730 		vm_pageout_update_period = 600;
1731 
1732 	/* XXX does not really belong here */
1733 	if (vm_page_max_wired == 0)
1734 		vm_page_max_wired = vm_cnt.v_free_count / 3;
1735 }
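/*
 * Worked example of the arithmetic above, assuming 4 KB pages, a 64 KB
 * MAXBSIZE and roughly 1,000,000 managed pages (about 4 GB of RAM):
 *
 *	v_free_min            = 4 + (1000000 - 1024) / 200  = 4998
 *	v_pageout_free_min    = (2 * 65536) / 4096 + 2       = 34
 *	v_free_reserved       = 16 + 34 + 1000000 / 768      = 1352
 *	v_free_target         = 4 * 4998 + 1352              = 21344
 *	v_free_min           += 1352                        -> 6350
 *	v_inactive_target     = 3 * 21344 / 2                = 32016
 *	pageout_wakeup_thresh = (6350 / 10) * 11             = 6985
 *
 * The exact figures vary with PAGE_SIZE, MAXBSIZE and the tunables; the
 * point is only the relative ordering of the thresholds.
 */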
1736 
1737 /*
1738  *     vm_pageout is the high level pageout daemon.
1739  */
1740 static void
1741 vm_pageout(void)
1742 {
1743 	int error;
1744 #if MAXMEMDOM > 1
1745 	int i;
1746 #endif
1747 
1748 	swap_pager_swap_init();
1749 #if MAXMEMDOM > 1
1750 	for (i = 1; i < vm_ndomains; i++) {
1751 		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
1752 		    curproc, NULL, 0, 0, "dom%d", i);
1753 		if (error != 0) {
1754 			panic("starting pageout for domain %d, error %d\n",
1755 			    i, error);
1756 		}
1757 	}
1758 #endif
1759 	error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
1760 	    0, 0, "uma");
1761 	if (error != 0)
1762 		panic("starting uma_reclaim helper, error %d\n", error);
1763 	vm_pageout_worker((void *)(uintptr_t)0);
1764 }
1765 
1766 /*
1767  * Unless the free page queue lock is held by the caller, this function
1768  * should be regarded as advisory.  Specifically, the caller should
1769  * not msleep() on &vm_cnt.v_free_count following this function unless
1770  * the free page queue lock is held until the msleep() is performed.
1771  */
1772 void
1773 pagedaemon_wakeup(void)
1774 {
1775 
1776 	if (!vm_pages_needed && curthread->td_proc != pageproc) {
1777 		vm_pages_needed = 1;
1778 		wakeup(&vm_pages_needed);
1779 	}
1780 }
1781 
1782 #if !defined(NO_SWAPPING)
1783 static void
1784 vm_req_vmdaemon(int req)
1785 {
1786 	static int lastrun = 0;
1787 
1788 	mtx_lock(&vm_daemon_mtx);
1789 	vm_pageout_req_swapout |= req;
1790 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
1791 		wakeup(&vm_daemon_needed);
1792 		lastrun = ticks;
1793 	}
1794 	mtx_unlock(&vm_daemon_mtx);
1795 }
1796 
1797 static void
1798 vm_daemon(void)
1799 {
1800 	struct rlimit rsslim;
1801 	struct proc *p;
1802 	struct thread *td;
1803 	struct vmspace *vm;
1804 	int breakout, swapout_flags, tryagain, attempts;
1805 #ifdef RACCT
1806 	uint64_t rsize, ravailable;
1807 #endif
1808 
1809 	while (TRUE) {
1810 		mtx_lock(&vm_daemon_mtx);
1811 		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
1812 #ifdef RACCT
1813 		    racct_enable ? hz : 0
1814 #else
1815 		    0
1816 #endif
1817 		);
1818 		swapout_flags = vm_pageout_req_swapout;
1819 		vm_pageout_req_swapout = 0;
1820 		mtx_unlock(&vm_daemon_mtx);
1821 		if (swapout_flags)
1822 			swapout_procs(swapout_flags);
1823 
1824 		/*
1825 		 * Scan the processes: deactivate pages from any process that
1826 		 * exceeds its rlimit or is swapped out.
1827 		 */
1828 		tryagain = 0;
1829 		attempts = 0;
1830 again:
1831 		attempts++;
1832 		sx_slock(&allproc_lock);
1833 		FOREACH_PROC_IN_SYSTEM(p) {
1834 			vm_pindex_t limit, size;
1835 
1836 			/*
1837 			 * if this is a system process or if we have already
1838 			 * looked at this process, skip it.
1839 			 */
1840 			PROC_LOCK(p);
1841 			if (p->p_state != PRS_NORMAL ||
1842 			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
1843 				PROC_UNLOCK(p);
1844 				continue;
1845 			}
1846 			/*
1847 			 * if the process is in a non-running type state,
1848 			 * don't touch it.
1849 			 */
1850 			breakout = 0;
1851 			FOREACH_THREAD_IN_PROC(p, td) {
1852 				thread_lock(td);
1853 				if (!TD_ON_RUNQ(td) &&
1854 				    !TD_IS_RUNNING(td) &&
1855 				    !TD_IS_SLEEPING(td) &&
1856 				    !TD_IS_SUSPENDED(td)) {
1857 					thread_unlock(td);
1858 					breakout = 1;
1859 					break;
1860 				}
1861 				thread_unlock(td);
1862 			}
1863 			if (breakout) {
1864 				PROC_UNLOCK(p);
1865 				continue;
1866 			}
1867 			/*
1868 			 * get a limit
1869 			 */
1870 			lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
1871 			limit = OFF_TO_IDX(
1872 			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
1873 
1874 			/*
1875 			 * Let processes that are swapped out really be
1876 			 * swapped out: set the limit to nothing (this will
1877 			 * force a swap-out).
1878 			 */
1879 			if ((p->p_flag & P_INMEM) == 0)
1880 				limit = 0;	/* XXX */
1881 			vm = vmspace_acquire_ref(p);
1882 			PROC_UNLOCK(p);
1883 			if (vm == NULL)
1884 				continue;
1885 
1886 			size = vmspace_resident_count(vm);
1887 			if (size >= limit) {
1888 				vm_pageout_map_deactivate_pages(
1889 				    &vm->vm_map, limit);
1890 			}
1891 #ifdef RACCT
1892 			if (racct_enable) {
1893 				rsize = IDX_TO_OFF(size);
1894 				PROC_LOCK(p);
1895 				racct_set(p, RACCT_RSS, rsize);
1896 				ravailable = racct_get_available(p, RACCT_RSS);
1897 				PROC_UNLOCK(p);
1898 				if (rsize > ravailable) {
1899 					/*
1900 					 * Don't be overly aggressive; this
1901 					 * might be an innocent process,
1902 					 * and the limit could've been exceeded
1903 					 * by some memory hog.  Don't try
1904 					 * to deactivate more than 1/4th
1905 					 * of process' resident set size.
1906 					 * of the process' resident set size.
1907 					if (attempts <= 8) {
1908 						if (ravailable < rsize -
1909 						    (rsize / 4)) {
1910 							ravailable = rsize -
1911 							    (rsize / 4);
1912 						}
1913 					}
1914 					vm_pageout_map_deactivate_pages(
1915 					    &vm->vm_map,
1916 					    OFF_TO_IDX(ravailable));
1917 					/* Update RSS usage after paging out. */
1918 					size = vmspace_resident_count(vm);
1919 					rsize = IDX_TO_OFF(size);
1920 					PROC_LOCK(p);
1921 					racct_set(p, RACCT_RSS, rsize);
1922 					PROC_UNLOCK(p);
1923 					if (rsize > ravailable)
1924 						tryagain = 1;
1925 				}
1926 			}
1927 #endif
1928 			vmspace_free(vm);
1929 		}
1930 		sx_sunlock(&allproc_lock);
1931 		if (tryagain != 0 && attempts <= 10)
1932 			goto again;
1933 	}
1934 }
1935 #endif			/* !defined(NO_SWAPPING) */
1936