/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2018 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
#include <sys/stdbool.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

/*
 * FREE MEMORY MANAGEMENT
 *
 * Management of the pool of free pages is a tricky business.  There are
 * several critical threshold values which constrain our allocation of new
 * pages and inform the rate of paging out of memory to swap.  These threshold
 * values, and the behaviour they induce, are described below in descending
 * order of size -- and thus increasing order of severity!
 *
 *   +---------------------------------------------------- physmem (all memory)
 *   |
 *   | Ordinarily there are no particular constraints placed on page
 *   v allocation.  The page scanner is not running and page_create_va()
 *   | will effectively grant all page requests (whether from the kernel
 *   | or from user processes) without artificial delay.
 *   |
 *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
 *   |
 *   | When we have less than "lotsfree" pages, pageout_scanner() is
 *   v signalled by schedpaging() to begin looking for pages that can
 *   | be evicted to disk to bring us back above lotsfree.  At this
 *   | stage there is still no constraint on allocation of free pages.
 *   |
 *   | For small systems, we set a lower bound of 16MB for lotsfree;
 *   v this is the natural value for a system with 1GB memory.  This is
 *   | to ensure that the pageout reserve pool contains at least 4MB
 *   | for use by ZFS.
 *   |
 *   | For systems with a large amount of memory, we constrain lotsfree
 *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 *   v at some point the required slack relates more closely to the
 *   | rate at which paging can occur than to the total amount of memory.
 *   |
 *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 *   |
 *   | When we drop below desfree, a number of kernel facilities will
 *   v wait before allocating more memory, under the assumption that
 *   | pageout or reaping will make progress and free up some memory.
 *   | This behaviour is not especially coordinated; look for comparisons
 *   | of desfree and freemem.
 *   |
 *   | In addition to various attempts at advisory caution, clock()
 *   | will wake up the thread that is ordinarily parked in sched().
 *   | This routine is responsible for the heavy-handed swapping out
 *   v of entire processes in an attempt to arrest the slide of free
 *   | memory.  See comments in sched.c for more details.
 *   |
 *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 *   |
 *   | These two separate tunables have, by default, the same value.
 *   v Various parts of the kernel use minfree to signal the need for
 *   | more aggressive reclamation of memory, and sched() is more
 *   | aggressive at swapping processes out.
 *   |
 *   | If free memory falls below throttlefree, page_create_va() will
 *   | use page_create_throttle() to begin holding most requests for
 *   | new pages while pageout and reaping free up memory.  Sleeping
 *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 *   | more memory.  Non-sleeping allocations are generally allowed to
 *   | proceed, unless their priority is explicitly lowered with
 *   | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).).
 *   |
 *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 *   |
 *   | When we hit throttlefree, the situation is already dire.  The
 *   v system is generally paging out memory and swapping out entire
 *   | processes in order to free up memory for continued operation.
 *   |
 *   | Unfortunately, evicting memory to disk generally requires short
 *   | term use of additional memory; e.g., allocation of buffers for
 *   | storage drivers, updating maps of free and used blocks, etc.
 *   | As such, pageout_reserve is the number of pages that we keep in
 *   | special reserve for use by pageout() and sched() and by any
 *   v other parts of the kernel that need to be working for those to
 *   | make forward progress such as the ZFS I/O pipeline.
 *   |
 *   | When we are below pageout_reserve, we fail or hold any allocation
 *   | that has not explicitly requested access to the reserve pool.
 *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 *   | can implicitly tap the reserve.  For more details, see the
 *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 *   |
 *   +---------------------------------------------------------- no free memory
 *   |
 *   | If we have arrived here, things are very bad indeed.  It is
 *   v surprisingly difficult to tell if this condition is even fatal,
 *   | as enough memory may have been granted to pageout() and to the
 *   | ZFS I/O pipeline that requests for eviction that have already been
 *   | made will complete and free up memory some time soon.
 *   |
 *   | If free memory does not materialise, the system generally remains
 *   | deadlocked.  The pageout_deadman() below is run once per second
 *   | from clock(), seeking to limit the amount of time a single request
 *   v to page out can be blocked before the system panics to get a crash
 *   | dump and return to service.
 *   |
 *   +-------------------------------------------------------------------------
 */
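
/*
 * As a concrete illustration of the default (pageout_threshold_style == 0)
 * sizing above, consider a hypothetical machine with 16 GiB of physical
 * memory; the figures below are worked examples only, not measurements:
 *
 *	lotsfree	= physmem / 64		= 256 MiB
 *	desfree		= lotsfree / 2		= 128 MiB
 *	minfree		= 3 * desfree / 4	=  96 MiB
 *	throttlefree	= minfree		=  96 MiB
 *	pageout_reserve	= 3 * throttlefree / 4	=  72 MiB
 *
 * 256 MiB lies between the 16 MiB floor and the 2 GiB ceiling, so no
 * clamping occurs in this example.
 */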

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock().  If they are patched non-zero
 * in the loaded kernel they are left alone, and may thus be changed per
 * system using "mdb -kw" on the loaded system.
 */
pgcnt_t		slowscan = 0;
pgcnt_t		fastscan = 0;

static pgcnt_t	handspreadpages = 0;

/*
 * looppages:
 *     Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *     Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t	loopfraction = 2;
static pgcnt_t	looppages;

static uint_t	min_percent_cpu = 4;
static uint_t	max_percent_cpu = 80;
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;

#define		MEGABYTES		(1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *     set to 1 to use the previous default threshold size calculation;
 *     i.e., each threshold is half of the next largest value.
 */
uint_t		pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine.
 * The minimum and maximum overrides are specified in pages; a zero value
 * means that the defaults (below, expressed in bytes) are used.
 */
uint_t		lotsfree_fraction = 64;
pgcnt_t		lotsfree_min = 0;
pgcnt_t		lotsfree_max = 0;

#define		LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
#define		LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)
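
/*
 * For illustration, an operator could override the automatic sizing with an
 * /etc/system fragment such as the following (arbitrary example values,
 * given in pages as discussed above):
 *
 *	set lotsfree_fraction = 32
 *	set lotsfree_min = 0x2000
 *	set lotsfree_max = 0x40000
 *
 * Note that setupclock() captures the boot-time values of lotsfree_min and
 * lotsfree_max in "clockinit" below, so later changes to these two variables
 * on a live system have no effect on recalculation.
 */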

/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is not larger than the threshold above, the specified value will
 * be used directly without any additional calculation or adjustment.  The
 * boot time value of these overrides is preserved in the "clockinit" struct.
 * More detail is available in the comment at the top of the file.
 */
pgcnt_t		maxpgio = 0;
pgcnt_t		minfree = 0;
pgcnt_t		desfree = 0;
pgcnt_t		lotsfree = 0;
pgcnt_t		needfree = 0;
pgcnt_t		throttlefree = 0;
pgcnt_t		pageout_reserve = 0;

pgcnt_t		deficit;
pgcnt_t		nscan;
pgcnt_t		desscan;

/* The maximum supported number of pageout_scanner() threads */
#define	MAX_PSCAN_THREADS	16

/*
 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 * number of nanoseconds in each wakeup cycle that gives the equivalent of
 * some underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *     nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *     nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *     Number of nanoseconds budgeted for each wakeup cycle.
 *     Computed each time around by schedpaging().
 *     Varies between min_pageout_nsec and max_pageout_nsec,
 *     depending on memory pressure.
 */
static hrtime_t	min_pageout_nsec;
static hrtime_t	max_pageout_nsec;
static hrtime_t	pageout_nsec;
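
/*
 * With the default min_percent_cpu (4) and max_percent_cpu (80), and
 * SCHEDPAGING_HZ wakeups per second (4, defined below), the budgets work
 * out to:
 *
 *	min_pageout_nsec = 10^9 * 4 / 100 / 4	=  10,000,000ns  (10ms)
 *	max_pageout_nsec = 10^9 * 80 / 100 / 4	= 200,000,000ns (200ms)
 *
 * i.e., each wakeup may spend between 10ms and 200ms of its 250ms cycle
 * scanning, depending on memory pressure.
 */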

static bool	reset_hands[MAX_PSCAN_THREADS];

#define	PAGES_POLL_MASK	1023

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan SCHEDPAGING_HZ
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging() resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging() sets this value based on the amount of
 * currently available memory.
 */
#define	SCHEDPAGING_HZ	4

#define	WAKE_PAGEOUT_SCANNER(tag)			\
	do {						\
		DTRACE_PROBE(schedpage__wake__ ## tag);	\
		cv_broadcast(&proc_pageout->p_cv);	\
	} while (0)
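
/*
 * The token pasting above yields SDT probes such as schedpage__wake__sampling,
 * schedpage__wake__lowmem and schedpage__wake__reducing; in DTrace the double
 * underscores read as dashes, so (for example) wakeups due to memory pressure
 * could be counted with something like:
 *
 *	dtrace -n 'sdt:::schedpage-wake-lowmem { @ = count(); }'
 */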

/*
 * despagescanners:
 *	The desired number of page scanner threads. For testing purposes, this
 *	value can be set in /etc/system or tuned directly with mdb(1). The
 *	system will bring the actual number of threads into line with the
 *	desired number. If set to an invalid value, the system will correct the
 *	setting.
 */
uint_t despagescanners = 0;

/*
 * pageout_sample_lim:
 *     The limit on the number of samples needed to establish a value for new
 *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *     handspreadpages.
 *
 * pageout_sample_cnt:
 *     Current sample number.  Once the sample gets large enough, set new
 *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *     The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *     The accumulated nanoseconds for the sample.
 *
 * pageout_sampling:
 *     True while sampling is still in progress.
 *
 * pageout_rate:
 *     Rate in pages/second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *     Initially zero while the system scan rate is measured by
 *     pageout_scanner(), which then sets this value once per system boot
 *     after enough samples have been recorded (pageout_sample_cnt).  Once
 *     set, this new value is used for fastscan and handspreadpages.
 */
typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrtime_t	pageout_sample_etime = 0;
static bool	pageout_sampling = true;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

/* The current number of page scanner threads */
static uint_t n_page_scanners = 1;
/* The number of page scanner threads that are actively scanning. */
static uint_t pageouts_running;

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t	pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t	memavail_lock;
kcondvar_t	memavail_cv;

typedef enum pageout_hand {
	POH_FRONT = 1,
	POH_BACK,
} pageout_hand_t;

typedef enum {
	CKP_INELIGIBLE,
	CKP_NOT_FREED,
	CKP_FREED,
} checkpage_result_t;

static checkpage_result_t checkpage(page_t *, pageout_hand_t);

static struct clockinit {
	bool ci_init;
	pgcnt_t ci_lotsfree_min;
	pgcnt_t ci_lotsfree_max;
	pgcnt_t ci_lotsfree;
	pgcnt_t ci_desfree;
	pgcnt_t ci_minfree;
	pgcnt_t ci_throttlefree;
	pgcnt_t ci_pageout_reserve;
	pgcnt_t ci_maxpgio;
	pgcnt_t ci_maxfastscan;
	pgcnt_t ci_fastscan;
	pgcnt_t ci_slowscan;
	pgcnt_t ci_handspreadpages;
	uint_t  ci_despagescanners;
} clockinit = { .ci_init = false };

static inline pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
	if (value < minimum)
		return (minimum);
	else if (value > maximum)
		return (maximum);
	else
		return (value);
}

static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
	if (initval == 0 || initval >= initval_ceiling)
		return (defval);
	else
		return (initval);
}
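
/*
 * Both helpers are used below when applying operator overrides.  For
 * example, tune(5, 10, 20) accepts the override and returns 5, while
 * tune(0, 10, 20) and tune(15, 10, 20) both reject the override and fall
 * back to the default of 20.
 */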

/*
 * On large memory systems, multiple instances of the page scanner are run,
 * each responsible for a separate region of memory. This speeds up page
 * invalidation under low memory conditions.
 *
 * For testing purposes, despagescanners can be set in /etc/system or via
 * mdb(1) and it will be used as a guide for how many page scanners to create;
 * the value will be adjusted if it is not sensible. Otherwise, the number of
 * page scanners is determined dynamically based on handspreadpages.
 */
static void
recalc_pagescanners(void)
{
	uint_t des;

	/* If the initial calibration has not been done, take no action. */
	if (pageout_new_spread == 0)
		return;

	/*
	 * If `clockinit.ci_despagescanners` is non-zero, then a value for
	 * `despagescanners` was set during initial boot. In this case, if
	 * `despagescanners` has been reset to 0 then we want to revert to
	 * that initial boot value.
	 */
	if (despagescanners == 0)
		despagescanners = clockinit.ci_despagescanners;

	if (despagescanners != 0) {
		/*
		 * We have a desired number of page scanners, either from
		 * /etc/system or set via mdb. Try to use it (it will be
		 * adjusted below if necessary).
		 */
		des = despagescanners;
	} else {
		/*
		 * Calculate the number of desired scanners based on the
		 * system's memory size.
		 *
		 * A 64GiB region size is used as the basis for calculating how
		 * many scanner threads should be created. For systems with up
		 * to 64GiB of RAM, a single thread is used; for very large
		 * memory systems the threads are limited to MAX_PSCAN_THREADS.
		 */
		des = (looppages - 1) / btop(64ULL << 30) + 1;
	}

	/*
	 * Clamp the number of scanners so that we have no more than
	 * MAX_PSCAN_THREADS and so that each scanner covers at least 10% more
	 * than handspreadpages.
	 */
	pgcnt_t min_scanner_pages = handspreadpages + handspreadpages / 10;
	pgcnt_t max_scanners = looppages / min_scanner_pages;
	despagescanners = clamp(des, 1,
	    clamp(max_scanners, 1, MAX_PSCAN_THREADS));
}
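
/*
 * To illustrate the automatic sizing: a hypothetical 256 GiB system that has
 * finished calibration gets (256 - 1) / 64 + 1 = 4 scanner threads, anything
 * of 64 GiB or less gets 1, and systems of 1 TiB and beyond are capped at
 * MAX_PSCAN_THREADS (16), subject to the handspreadpages constraint above.
 */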

/*
 * Set up the paging constants for the clock algorithm used by
 * pageout_scanner(), and by the virtual memory system overall.  See the
 * comments at the top of this file for more information about the threshold
 * values and system responses to memory pressure.
 *
 * This routine is called once by main() at startup, after the initial size of
 * physical memory is determined.  It may be called again later if memory is
 * added to or removed from the system, or if new measurements of the page scan
 * rate become available.
 */
void
setupclock(void)
{
	bool half = (pageout_threshold_style == 1);
	bool recalc = true;

	looppages = total_pages;

	/*
	 * The operator may have provided specific values for some of the
	 * tunables via /etc/system.  On our first call, we preserve those
	 * values so that they can be used for subsequent recalculations.
	 *
	 * A value of zero for any tunable means we will use the default
	 * sizing.
	 */
	if (!clockinit.ci_init) {
		clockinit.ci_init = true;

		clockinit.ci_lotsfree_min = lotsfree_min;
		clockinit.ci_lotsfree_max = lotsfree_max;
		clockinit.ci_lotsfree = lotsfree;
		clockinit.ci_desfree = desfree;
		clockinit.ci_minfree = minfree;
		clockinit.ci_throttlefree = throttlefree;
		clockinit.ci_pageout_reserve = pageout_reserve;
		clockinit.ci_maxpgio = maxpgio;
		clockinit.ci_maxfastscan = maxfastscan;
		clockinit.ci_fastscan = fastscan;
		clockinit.ci_slowscan = slowscan;
		clockinit.ci_handspreadpages = handspreadpages;
		clockinit.ci_despagescanners = despagescanners;

		/*
		 * The first call does not trigger a recalculation, only
		 * subsequent calls.
		 */
		recalc = false;
	}

	/*
	 * Configure paging threshold values.  For more details on what each
	 * threshold signifies, see the comments at the top of this file.
	 */
	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
	    btop(LOTSFREE_MAX_DEFAULT));
	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
	    btop(LOTSFREE_MIN_DEFAULT));

	lotsfree = tune(clockinit.ci_lotsfree, looppages,
	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

	desfree = tune(clockinit.ci_desfree, lotsfree,
	    lotsfree / 2);

	minfree = tune(clockinit.ci_minfree, desfree,
	    half ? desfree / 2 : 3 * desfree / 4);

	throttlefree = tune(clockinit.ci_throttlefree, desfree,
	    minfree);

	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
	    half ? throttlefree / 2 : 3 * throttlefree / 4);

	/*
	 * maxpgio bounds the amount of paging that is considered acceptable.
	 * The historical rationale is that a disk arm that is 2/3 busy is as
	 * much as can be tolerated for paging, assuming one operation per
	 * disk revolution.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (clockinit.ci_maxpgio == 0) {
		maxpgio = (DISKRPM * 2) / 3;
	} else {
		maxpgio = clockinit.ci_maxpgio;
	}

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  The fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, clock speed, etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 MHz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 MHz SC2000:		68 Meg
	 *
	 *	40 MHz 486:		26 Meg
	 *	66 MHz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 MHz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger page sizes, etc.) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPUs can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (clockinit.ci_maxfastscan == 0) {
		if (pageout_new_spread != 0) {
			maxfastscan = pageout_new_spread;
		} else {
			maxfastscan = MAXHANDSPREADPAGES;
		}
	} else {
		maxfastscan = clockinit.ci_maxfastscan;
	}

	if (clockinit.ci_fastscan == 0) {
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	} else {
		fastscan = clockinit.ci_fastscan;
	}

	if (fastscan > looppages / loopfraction) {
		fastscan = looppages / loopfraction;
	}

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (clockinit.ci_slowscan == 0) {
		slowscan = MIN(fastscan / 10, maxslowscan);
	} else {
		slowscan = clockinit.ci_slowscan;
	}

	if (slowscan > fastscan / 2) {
		slowscan = fastscan / 2;
	}

	/*
	 * Handspreadpages is the distance (in pages) between front and back
	 * pageout daemon hands.  The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises. It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (clockinit.ci_handspreadpages == 0) {
		handspreadpages = fastscan;
	} else {
		handspreadpages = clockinit.ci_handspreadpages;
	}

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
	 * back hand to look at a page during the same wakeup of the pageout
	 * daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages) {
		handspreadpages = looppages - 1;
	}

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle. The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second.
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);

	/*
	 * If not called for recalculation, return and skip the remaining
	 * steps.
	 */
	if (!recalc)
		return;

	/*
	 * Set a flag to re-evaluate the clock hand positions.
	 */
	for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
		reset_hands[i] = true;

	recalc_pagescanners();
}

static kmutex_t	pageout_mutex;

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

/*
 * If pageout() is stuck on a single push for this many seconds,
 * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 * to 0, the deadman will have no effect.
 *
 * Note that we are only looking for stalls in the calls that pageout() makes
 * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 * I/O, which should not take long unless the underlying strategy call blocks
 * indefinitely for memory.  The actual I/O request happens (or fails) later.
 */
uint_t pageout_deadman_seconds = 90;
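
/*
 * For example, the deadman could be disabled on a live system for debugging
 * with something like:
 *
 *	echo 'pageout_deadman_seconds/W 0' | mdb -kw
 *
 * (pageout_deadman_seconds is a uint_t, hence the 32-bit /W write.)
 */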

static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;

int async_list_size = 8192;

static void pageout_scanner(void *);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone -- don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;
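
/*
 * po_share adapts to conditions: pageout_scanner() doubles it (up to
 * MAX_PO_SHARE, i.e. 8 << 24) when a full lap of the clock face fails to
 * free enough pages, and schedpaging() halves it again (down to
 * MIN_PO_SHARE) once memory is plentiful.
 */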

/*
 * Schedule the rate for paging.  The rate is a linear interpolation:
 * slowscan when freemem is at lotsfree, rising to fastscan when the system
 * is out of memory.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		if (pageouts_running != 0)
			goto out;

		/* No pageout scanner threads running. */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		/* Note that vavail is signed so don't use clamp() here */
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		if (needfree > 0 && pageout_new_spread == 0) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / SCHEDPAGING_HZ;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / SCHEDPAGING_HZ;
			desscan = (pgcnt_t)result;
		}

		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

		DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
		    pageout_nsec);

		if (pageout_new_spread != 0 && despagescanners != 0 &&
		    despagescanners != n_page_scanners) {
			/*
			 * We have finished the pagescan initialisation and the
			 * desired number of page scanners has changed, either
			 * because sampling just finished, because of a memory
			 * DR, or because despagescanners has been modified on
			 * the fly (e.g. via mdb(1)).
			 */
			uint_t curr_nscan = n_page_scanners;
			uint_t i;

			/* Re-validate despagescanners */
			recalc_pagescanners();

			n_page_scanners = despagescanners;

			for (i = 0; i < MAX_PSCAN_THREADS; i++)
				reset_hands[i] = true;

			/* If we need more scanners, start them now. */
			for (i = curr_nscan; i < n_page_scanners; i++) {
				(void) lwp_kernel_create(proc_pageout,
				    pageout_scanner, (void *)(uintptr_t)i,
				    TS_RUN, curthread->t_pri);
			}

			/*
			 * If the number of scanners has decreased, trigger a
			 * wakeup so that the excess threads will terminate.
			 */
			if (n_page_scanners < curr_nscan) {
				WAKE_PAGEOUT_SCANNER(reducing);
			}
		}

		if (pageout_sampling) {
			/*
			 * We still need to measure the rate at which the
			 * system is able to scan pages of memory. Each of
			 * these initial samples is a scan of as much system
			 * memory as practical, regardless of whether or not we
			 * are experiencing memory pressure.
			 */
			desscan = total_pages;
			pageout_nsec = max_pageout_nsec;

			WAKE_PAGEOUT_SCANNER(sampling);
		} else if (freemem < lotsfree + needfree) {
			/*
			 * We need more memory.
			 */
			WAKE_PAGEOUT_SCANNER(lowmem);
		} else {
			/*
			 * There are enough free pages; there is no need to
			 * kick the scanner threads.  And next time around,
			 * keep more of the `highly shared' pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE)
				po_share >>= 1;
		}
out:
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
	 * in this case it is not needed -- the waiters will be woken up during
	 * the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}
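
/*
 * A worked example of the interpolation above, with hypothetical values:
 * if slowscan is 100, fastscan is 100,000, and free memory sits at the
 * midpoint so that vavail == lotsfree / 2, then
 *
 *	desscan = (100 * vavail + 100000 * (lotsfree - vavail))
 *	    / lotsfree / SCHEDPAGING_HZ
 *		= (100 + 100000) / 2 / 4 = ~12500 pages per wakeup
 *
 * while pageout_nsec lands midway between min_pageout_nsec and
 * max_pageout_nsec (105ms with the defaults, i.e. a ~42% duty cycle).
 */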

pgcnt_t		pushes;
ulong_t		push_list_size;		/* # of requests on pageout queue */

/*
 * Paging out should always be enabled.  This tunable exists to hold pageout
 * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 * sleep each time it is woken by schedpaging().
 */
uint_t dopageout = 1;

/*
 * The page out daemon, which runs as process 2.
 *
 * The daemon treats physical memory as a circular array of pages and scans
 * the pages using a 'two-handed clock' algorithm. The front hand moves
 * through the pages, clearing the reference bit. The back hand travels a
 * distance (handspreadpages) behind the front hand, freeing the pages that
 * have not been referenced in the time since the front hand passed. If
 * modified, they are first written to their backing store before being
 * freed.
 *
 * In order to make page invalidation more responsive on machines with
 * larger memory, multiple pageout_scanner threads may be created. In this
 * case, each thread is given a segment of the memory "clock face" so that
 * memory can be reclaimed more quickly. As long as there are at least
 * lotsfree free pages, the pageout_scanner threads are not run.
 *
 * There are multiple threads that act on behalf of the pageout process.
 * A set of threads (pageout_scanner) scans pages and frees them if they
 * don't require any VOP_PUTPAGE operation. If a page must be written back
 * to its backing store, the request is put on a list and the other
 * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
 * requests from the list and processes them. Some filesystems may require
 * resources for the VOP_PUTPAGE operations (like memory) and hence can
 * block the pageout thread, but the scanner thread can still operate.
 * There is still no guarantee that memory deadlocks cannot occur.
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri;

	/* Create the first pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner,
	    (void *)0,	/* this is instance 0, not NULL */
	    TS_RUN, pageout_pri - 1);

	/*
	 * kick off the pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		pageout_pushing = true;
		mutex_exit(&push_lock);

		DTRACE_PROBE(pageout__push);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

static void
pageout_sample_add(pgcnt_t count, hrtime_t elapsed)
{
	VERIFY(pageout_sampling);

	/*
	 * The global variables used below are only modified during initial
	 * scanning when there is a single page scanner thread running.
	 */
	pageout_sample_pages += count;
	pageout_sample_etime += elapsed;
	pageout_sample_cnt++;

	if (pageout_sample_cnt >= pageout_sample_lim) {
		/*
		 * We have enough samples, set the spread.
		 */
		pageout_sampling = false;
		pageout_rate = (hrrate_t)pageout_sample_pages *
		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
		pageout_new_spread = pageout_rate / 10;
	}
}
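
/*
 * For example (illustrative numbers only): if the samples together scanned
 * 4,000,000 pages in a total of 2 seconds of scanner time, then
 *
 *	pageout_rate       = 4000000 * NANOSEC / 2000000000 = 2000000 pages/sec
 *	pageout_new_spread = pageout_rate / 10              =  200000 pages
 *
 * so fastscan and handspreadpages are subsequently sized from a ~10% duty
 * cycle estimate of the measured scan rate.
 */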

static inline page_t *
wrapping_page_next(page_t *cur, page_t *start, page_t *end)
{
	if (cur == end)
		return (start);
	return (page_nextn(cur, 1));
}

/*
 * Kernel thread that scans pages looking for ones to free.
 */
static void
pageout_scanner(void *a)
{
	page_t *fhand, *bhand, *fhandstart;
	page_t *regionstart, *regionend;
	uint_t laps;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_cnt;
	pgcnt_t	pcount;
	hrtime_t sample_start, sample_end;
	uint_t inst = (uint_t)(uintptr_t)a;

	VERIFY3U(inst, <, MAX_PSCAN_THREADS);

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point, on the assumption that after one circuit things
	 * will have settled down; restarts shouldn't be frequent.
	 */
	reset_hands[inst] = true;

	pageouts_running++;
	mutex_exit(&pageout_mutex);

loop:
	cv_signal_pageout();

	mutex_enter(&pageout_mutex);
	pageouts_running--;
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
	pageouts_running++;
	mutex_exit(&pageout_mutex);

	/*
	 * Check if pageout has been disabled for debugging purposes.
	 */
	if (dopageout == 0)
		goto loop;

	/*
	 * One may reset the clock hands and scanned region for debugging
	 * purposes. Hands will also be reset on first thread startup, if
	 * the number of scanning threads (n_page_scanners) changes, or if
	 * memory is added to, or removed from, the system.
	 */
	if (reset_hands[inst]) {
		page_t *first;

		reset_hands[inst] = false;

		if (inst >= n_page_scanners) {
			/*
			 * The desired number of page scanners has been
			 * reduced and this instance is no longer wanted.
			 * Exit the lwp.
			 */
			VERIFY3U(inst, !=, 0);
			DTRACE_PROBE1(pageout__exit, uint_t, inst);
			mutex_enter(&pageout_mutex);
			pageouts_running--;
			mutex_exit(&pageout_mutex);
			mutex_enter(&curproc->p_lock);
			lwp_exit();
			/* NOTREACHED */
		}

		first = page_first();

		/*
		 * Each scanner thread gets its own sector of the memory
		 * clock face.
		 */
		pgcnt_t span, offset;

		span = looppages / n_page_scanners;
		VERIFY3U(span, >, handspreadpages);

		offset = inst * span;
		regionstart = page_nextn(first, offset);
		if (inst == n_page_scanners - 1) {
			/* The last instance goes up to the last page */
			regionend = page_nextn(first, looppages - 1);
		} else {
			regionend = page_nextn(regionstart, span - 1);
		}

		bhand = regionstart;
		fhand = page_nextn(bhand, handspreadpages);

		DTRACE_PROBE4(pageout__reset, uint_t, inst,
		    pgcnt_t, regionstart, pgcnt_t, regionend,
		    pgcnt_t, fhand);
	}

	/*
	 * This CPU kstat is only incremented here and we're on this CPU, so no
	 * lock.
	 */
	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

	/*
	 * Keep track of the number of times we have scanned all the way around
	 * the loop on this wakeup.
	 */
	laps = 0;

	/*
	 * Track the number of pages visited during this scan so that we can
	 * periodically measure our duty cycle.
	 */
	nscan_cnt = 0;
	pcount = 0;

	DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
	    hrtime_t, pageout_nsec, page_t *, bhand, page_t *, fhand);

	/*
	 * Record the initial position of the front hand for this cycle so
	 * that we can detect when the hand wraps around.
	 */
	fhandstart = fhand;

	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 */
	while (nscan_cnt < desscan) {
		checkpage_result_t rvfront, rvback;

		if (!pageout_sampling && freemem >= lotsfree + needfree) {
			/*
			 * We are not sampling and enough memory has become
			 * available that scanning is no longer required.
			 */
			DTRACE_PROBE1(pageout__memfree, uint_t, inst);
			break;
		}

		DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);

		/*
		 * Periodically check to see if we have exceeded the CPU duty
		 * cycle for a single wakeup.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			hrtime_t pageout_cycle_nsec;

			pageout_cycle_nsec = gethrtime() - sample_start;
			if (pageout_cycle_nsec >= pageout_nsec) {
				atomic_inc_64(&pageout_timeouts);
				DTRACE_PROBE1(pageout__timeout, uint_t, inst);
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fhand, POH_FRONT)) == CKP_FREED) {
			laps = 0;
		}
		if ((rvback = checkpage(bhand, POH_BACK)) == CKP_FREED) {
			laps = 0;
		}

		++pcount;

		/*
		 * This CPU kstat is only incremented here and we're on this
		 * CPU, so no lock.
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE)
			nscan_cnt++;

		/*
		 * Tick
		 */
		bhand = wrapping_page_next(bhand, regionstart, regionend);
		fhand = wrapping_page_next(fhand, regionstart, regionend);

		/*
		 * The front hand has wrapped around during this wakeup.
		 */
		if (fhand == fhandstart) {
			laps++;
			DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
			    uint_t, laps);

			/*
			 * This CPU kstat is only incremented here and we're
			 * on this CPU, so no lock.
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);

			if (laps > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					break;
				}
			}
		}
	}

	sample_end = gethrtime();
	atomic_add_long(&nscan, nscan_cnt);

	DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
	    pgcnt_t, nscan_cnt, pgcnt_t, pcount);

	/*
	 * Continue accumulating samples until we have enough to get a
	 * reasonable value for average scan rate.
	 */
	if (pageout_sampling) {
		VERIFY3U(inst, ==, 0);
		pageout_sample_add(pcount, sample_end - sample_start);
		/*
		 * If, after the sample just added, we have finished sampling,
		 * set up the paging constants.
		 */
		if (!pageout_sampling)
			setupclock();
	}

	goto loop;
}

/*
 * The pageout deadman is run once per second by clock().
 */
void
pageout_deadman(void)
{
	if (panicstr != NULL) {
		/*
		 * There is no pageout after panic.
		 */
		return;
	}

	if (pageout_deadman_seconds == 0) {
		/*
		 * The deadman is not enabled.
		 */
		return;
	}

	if (!pageout_pushing) {
		goto reset;
	}

	/*
	 * We are pushing a page.  Check to see if it is the same call we saw
	 * last time we looked:
	 */
	if (pageout_pushcount != pageout_pushcount_seen) {
		/*
		 * It is a different call from the last check, so we are not
		 * stuck.
		 */
		goto reset;
	}

	if (++pageout_stucktime >= pageout_deadman_seconds) {
		panic("pageout_deadman: stuck pushing the same page for %d "
		    "seconds (freemem is %lu)", pageout_deadman_seconds,
		    freemem);
	}

	return;

reset:
	/*
	 * Reset our tracking state to reflect that we are not stuck:
	 */
	pageout_stucktime = 0;
	pageout_pushcount_seen = pageout_pushcount;
}

/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * system (u-area, page table) or free, then leave it alone.  Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it.  If running the back hand,
 * check whether the page has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *	CKP_INELIGIBLE if the page is not a candidate at all,
 *	CKP_NOT_FREED  if the page was not freed, or
 *	CKP_FREED      if we freed it.
 */
static checkpage_result_t
checkpage(page_t *pp, pageout_hand_t whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode, since
	 *	    they are always "exclusively" locked;
	 *	- that are free;
	 *	- that are shared more than po_share times;
	 *	- that are already locked.
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */

	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (CKP_INELIGIBLE);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (CKP_INELIGIBLE);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Reject pages that cannot be freed.  The page_struct_lock need not
	 * be acquired to examine these fields, since the page has an
	 * "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */
	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == POH_FRONT) {
		pagesync_flag = HAT_SYNC_ZERORM;
	} else {
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;
	}

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If the page is referenced, make it unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		DTRACE_PROBE2(pageout__isref, page_t *, pp,
		    pageout_hand_t, whichhand);

		if (whichhand == POH_FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}

		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (CKP_NOT_FREED);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it. If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (CKP_INELIGIBLE);
		}

		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);

		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange to have it
	 * cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue I/O request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (CKP_NOT_FREED);
		}
		return (CKP_FREED);
	}

	/*
	 * Now we unload all the translations and put the page back on to the
	 * free list.  If the page was used (referenced or modified) after the
	 * pagesync but before it was unloaded we catch it and handle the page
	 * properly.
	 */
	DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
		goto recheck;
	}

	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (CKP_FREED);
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;		/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left. The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wake up pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}
1669