xref: /illumos-gate/usr/src/uts/common/os/vm_pageout.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2021 Oxide Computer Company
24  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
25  */
26 
27 /*
28  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
29  * Use is subject to license terms.
30  */
31 
32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
33 /* All Rights Reserved */
34 
35 /*
36  * University Copyright- Copyright (c) 1982, 1986, 1988
37  * The Regents of the University of California
38  * All Rights Reserved
39  *
40  * University Acknowledgment- Portions of this document are derived from
41  * software developed by the University of California, Berkeley, and its
42  * contributors.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/buf.h>
49 #include <sys/uio.h>
50 #include <sys/proc.h>
51 #include <sys/systm.h>
52 #include <sys/mman.h>
53 #include <sys/cred.h>
54 #include <sys/vnode.h>
55 #include <sys/vm.h>
56 #include <sys/vmparam.h>
57 #include <sys/vtrace.h>
58 #include <sys/cmn_err.h>
59 #include <sys/cpuvar.h>
60 #include <sys/user.h>
61 #include <sys/kmem.h>
62 #include <sys/debug.h>
63 #include <sys/callb.h>
64 #include <sys/mem_cage.h>
65 #include <sys/time.h>
66 #include <sys/stdbool.h>
67 
68 #include <vm/hat.h>
69 #include <vm/as.h>
70 #include <vm/seg.h>
71 #include <vm/page.h>
72 #include <vm/pvn.h>
73 #include <vm/seg_kmem.h>
74 
75 /*
76  * FREE MEMORY MANAGEMENT
77  *
78  * Management of the pool of free pages is a tricky business.  There are
79  * several critical threshold values which constrain our allocation of new
80  * pages and inform the rate of paging out of memory to swap.  These threshold
81  * values, and the behaviour they induce, are described below in descending
82  * order of size -- and thus increasing order of severity!
83  *
84  *   +---------------------------------------------------- physmem (all memory)
85  *   |
86  *   | Ordinarily there are no particular constraints placed on page
87  *   v allocation.  The page scanner is not running and page_create_va()
88  *   | will effectively grant all page requests (whether from the kernel
89  *   | or from user processes) without artificial delay.
90  *   |
91  *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
92  *   |
93  *   | When we have less than "lotsfree" pages, pageout_scanner() is
94  *   v signalled by schedpaging() to begin looking for pages that can
95  *   | be evicted to disk to bring us back above lotsfree.  At this
96  *   | stage there is still no constraint on allocation of free pages.
97  *   |
98  *   | For small systems, we set a lower bound of 16MB for lotsfree;
99  *   v this is the natural value for a system with 1GB memory.  This is
100  *   | to ensure that the pageout reserve pool contains at least 4MB
101  *   | for use by ZFS.
102  *   |
103  *   | For systems with a large amount of memory, we constrain lotsfree
104  *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
105  *   v at some point the required slack relates more closely to the
106  *   | rate at which paging can occur than to the total amount of memory.
107  *   |
108  *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
109  *   |
110  *   | When we drop below desfree, a number of kernel facilities will
111  *   v wait before allocating more memory, under the assumption that
112  *   | pageout or reaping will make progress and free up some memory.
113  *   | This behaviour is not especially coordinated; look for comparisons
114  *   | of desfree and freemem.
115  *   |
116  *   | In addition to various attempts at advisory caution, clock()
117  *   | will wake up the thread that is ordinarily parked in sched().
118  *   | This routine is responsible for the heavy-handed swapping out
119  *   v of entire processes in an attempt to arrest the slide of free
120  *   | memory.  See comments in sched.c for more details.
121  *   |
122  *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
123  *   |
124  *   | These two separate tunables have, by default, the same value.
125  *   v Various parts of the kernel use minfree to signal the need for
126  *   | more aggressive reclamation of memory, and sched() is more
127  *   | aggressive at swapping processes out.
128  *   |
129  *   | If free memory falls below throttlefree, page_create_va() will
130  *   | use page_create_throttle() to begin holding most requests for
131  *   | new pages while pageout and reaping free up memory.  Sleeping
132  *   v allocations (e.g., KM_SLEEP) are held here while we wait for
133  *   | more memory.  Non-sleeping allocations are generally allowed to
134  *   | proceed, unless their priority is explicitly lowered with
135  *   | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).).
136  *   |
137  *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
138  *   |
139  *   | When we hit throttlefree, the situation is already dire.  The
140  *   v system is generally paging out memory and swapping out entire
141  *   | processes in order to free up memory for continued operation.
142  *   |
143  *   | Unfortunately, evicting memory to disk generally requires short
144  *   | term use of additional memory; e.g., allocation of buffers for
145  *   | storage drivers, updating maps of free and used blocks, etc.
146  *   | As such, pageout_reserve is the number of pages that we keep in
147  *   | special reserve for use by pageout() and sched() and by any
148  *   v other parts of the kernel that need to be working for those to
149  *   | make forward progress such as the ZFS I/O pipeline.
150  *   |
151  *   | When we are below pageout_reserve, we fail or hold any allocation
152  *   | that has not explicitly requested access to the reserve pool.
153  *   | Access to the reserve is generally granted via the KM_PUSHPAGE
154  *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
155  *   | can implicitly tap the reserve.  For more details, see the
156  *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
157  *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
158  *   |
159  *   +---------------------------------------------------------- no free memory
160  *   |
161  *   | If we have arrived here, things are very bad indeed.  It is
162  *   v surprisingly difficult to tell if this condition is even fatal,
163  *   | as enough memory may have been granted to pageout() and to the
164  *   | ZFS I/O pipeline that requests for eviction that have already been
165  *   | made will complete and free up memory some time soon.
166  *   |
167  *   | If free memory does not materialise, the system generally remains
168  *   | deadlocked.  The pageout_deadman() below is run once per second
169  *   | from clock(), seeking to limit the amount of time a single request
170  *   v to page out can be blocked before the system panics to get a crash
171  *   | dump and return to service.
172  *   |
173  *   +-------------------------------------------------------------------------
174  */
175 
176 /*
177  * The following parameters control operation of the page replacement
178  * algorithm.  They are initialized to 0, and then computed at boot time based
179  * on the size of the system; see setupclock().  If they are patched non-zero
180  * in a loaded vmunix they are left alone and may thus be changed per system
181  * using "mdb -kw" on the loaded system.
182  */
/*
 * slowscan/fastscan: bounds on the page scan rate (pages per second); the
 * effective rate is interpolated between them by schedpaging() based on
 * the amount of free memory.
 */
pgcnt_t		slowscan = 0;
pgcnt_t		fastscan = 0;

/* Distance (in pages) between the front and back clock hands. */
static pgcnt_t	handspreadpages = 0;

/*
 * looppages:
 *     Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *     Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t	loopfraction = 2;
static pgcnt_t	looppages;

/*
 * Bounds on the scanner's CPU duty cycle (percent); converted into the
 * per-wakeup nanosecond budgets min_pageout_nsec/max_pageout_nsec below.
 */
static uint_t	min_percent_cpu = 4;
static uint_t	max_percent_cpu = 80;

/* Ceilings applied to fastscan and slowscan in setupclock(). */
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;
202 
203 #define		MEGABYTES		(1024ULL * 1024ULL)
204 
205 /*
206  * pageout_threshold_style:
207  *     set to 1 to use the previous default threshold size calculation;
208  *     i.e., each threshold is half of the next largest value.
209  */
210 uint_t		pageout_threshold_style = 0;
211 
212 /*
213  * The operator may override these tunables to request a different minimum or
214  * maximum lotsfree value, or to change the divisor we use for automatic
215  * sizing.
216  *
217  * By default, we make lotsfree 1/64th of the total memory in the machine.  The
218  * minimum and maximum are specified in bytes, rather than pages; a zero value
219  * means the default values (below) are used.
220  */
221 uint_t		lotsfree_fraction = 64;
222 pgcnt_t		lotsfree_min = 0;
223 pgcnt_t		lotsfree_max = 0;
224 
225 #define		LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
226 #define		LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)
227 
228 /*
229  * If these tunables are set to non-zero values in /etc/system, and provided
230  * the value is not larger than the threshold above, the specified value will
231  * be used directly without any additional calculation or adjustment.  The boot
232  * time value of these overrides is preserved in the "clockinit" struct.  More
233  * detail is available in the comment at the top of the file.
234  */
pgcnt_t		maxpgio = 0;		/* acceptable paging I/O rate (ops/sec) */
pgcnt_t		minfree = 0;		/* signals aggressive reclamation */
pgcnt_t		desfree = 0;		/* allocators begin to wait below this */
pgcnt_t		lotsfree = 0;		/* page scanner starts below this */
pgcnt_t		needfree = 0;		/* extra demand added to the thresholds */
pgcnt_t		throttlefree = 0;	/* page_create_va() throttles below this */
pgcnt_t		pageout_reserve = 0;	/* reserve pool for pageout()/sched() */

/*
 * NOTE(review): deficit is subtracted from freemem in schedpaging(); the
 * code that produces its value is outside this file -- confirm its exact
 * semantics there before relying on them.
 */
pgcnt_t		deficit;
pgcnt_t		nscan;		/* pages examined in the scanner's current pass */
pgcnt_t		desscan;	/* pages the scanner should examine next pass */
246 
247 /*
248  * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
249  * number of nanoseconds in each wakeup cycle that gives the equivalent of some
250  * underlying %CPU duty cycle.
251  *
252  * min_pageout_nsec:
253  *     nanoseconds/wakeup equivalent of min_percent_cpu.
254  *
255  * max_pageout_nsec:
256  *     nanoseconds/wakeup equivalent of max_percent_cpu.
257  *
258  * pageout_nsec:
259  *     Number of nanoseconds budgeted for each wakeup cycle.
260  *     Computed each time around by schedpaging().
261  *     Varies between min_pageout_nsec and max_pageout_nsec,
262  *     depending on memory pressure.
263  */
static hrtime_t	min_pageout_nsec;	/* per-wakeup CPU budget floor */
static hrtime_t	max_pageout_nsec;	/* per-wakeup CPU budget ceiling */
static hrtime_t	pageout_nsec;		/* current per-wakeup CPU budget */

/* Set when setupclock() recalculates, so the clock hands are repositioned. */
static uint_t	reset_hands;

/*
 * NOTE(review): presumably a mask used by pageout_scanner() to check its
 * time budget once every (PAGES_POLL_MASK + 1) pages; the scanner body is
 * not visible in this part of the file -- confirm there.
 */
#define	PAGES_POLL_MASK	1023
271 
272 /*
273  * pageout_sample_lim:
274  *     The limit on the number of samples needed to establish a value for new
275  *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
276  *     handspreadpages.
277  *
278  * pageout_sample_cnt:
279  *     Current sample number.  Once the sample gets large enough, set new
280  *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
281  *
282  * pageout_sample_pages:
283  *     The accumulated number of pages scanned during sampling.
284  *
285  * pageout_sample_etime:
286  *     The accumulated nanoseconds for the sample.
287  *
288  * pageout_rate:
289  *     Rate in pages/nanosecond, computed at the end of sampling.
290  *
291  * pageout_new_spread:
292  *     Initially zero while the system scan rate is measured by
293  *     pageout_scanner(), which then sets this value once per system boot after
294  *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
295  *     new value is used for fastscan and handspreadpages.
296  *
297  * sample_start, sample_end:
298  *     The hrtime at which the last pageout_scanner() sample began and ended.
299  */
/* Scan rate expressed in pages per nanosecond (see pageout_rate above). */
typedef hrtime_t hrrate_t;

/* The variables below are described in detail in the block comment above. */
static uint64_t	pageout_sample_lim = 4;		/* samples needed before tuning */
static uint64_t	pageout_sample_cnt = 0;		/* samples collected so far */
static pgcnt_t	pageout_sample_pages = 0;	/* pages scanned while sampling */
static hrrate_t	pageout_rate = 0;		/* measured rate, pages/nsec */
static pgcnt_t	pageout_new_spread = 0;		/* value adopted once sampled */

/* NOTE(review): set outside this view; appears to be one wakeup's duration. */
static hrtime_t	pageout_cycle_nsec;
static hrtime_t	sample_start, sample_end;	/* bounds of the last sample */
static hrtime_t	pageout_sample_etime = 0;	/* accumulated sample nsec */
311 
312 /*
313  * Record number of times a pageout_scanner() wakeup cycle finished because it
314  * timed out (exceeded its CPU budget), rather than because it visited
315  * its budgeted number of pages.
316  */
317 uint64_t	pageout_timeouts = 0;
318 
319 #ifdef VM_STATS
320 static struct pageoutvmstats_str {
321 	ulong_t	checkpage[3];
322 } pageoutvmstats;
323 #endif /* VM_STATS */
324 
325 /*
326  * Threads waiting for free memory use this condition variable and lock until
327  * memory becomes available.
328  */
329 kmutex_t	memavail_lock;
330 kcondvar_t	memavail_cv;
331 
/*
 * Identifies which clock hand a page is being examined for: the front
 * hand, which clears reference bits, or the back hand, which frees pages
 * still unreferenced when it arrives (see the comment above pageout()).
 */
typedef enum pageout_hand {
	POH_FRONT = 1,
	POH_BACK,
} pageout_hand_t;
336 
/*
 * Result of a checkpage() examination:
 *
 * CKP_INELIGIBLE:
 *     The page was not a candidate for eviction.
 * CKP_NOT_FREED:
 *     The page was examined but was not freed.
 * CKP_FREED:
 *     The page was freed.
 *
 * NOTE(review): checkpage() is not visible in this part of the file;
 * confirm the precise distinction between the first two values there.
 */
typedef enum {
	CKP_INELIGIBLE,
	CKP_NOT_FREED,
	CKP_FREED,
} checkpage_result_t;
342 
343 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
344 
/*
 * Boot-time snapshot of the operator-settable tunables, captured on the
 * first call to setupclock() so that later recalculations can distinguish
 * operator overrides (non-zero) from values we computed ourselves.
 */
static struct clockinit {
	bool ci_init;			/* true once the snapshot is taken */
	pgcnt_t ci_lotsfree_min;	/* boot-time value of lotsfree_min */
	pgcnt_t ci_lotsfree_max;	/* boot-time value of lotsfree_max */
	pgcnt_t ci_lotsfree;		/* boot-time value of lotsfree */
	pgcnt_t ci_desfree;		/* boot-time value of desfree */
	pgcnt_t ci_minfree;		/* boot-time value of minfree */
	pgcnt_t ci_throttlefree;	/* boot-time value of throttlefree */
	pgcnt_t ci_pageout_reserve;	/* boot-time value of pageout_reserve */
	pgcnt_t ci_maxpgio;		/* boot-time value of maxpgio */
	pgcnt_t ci_maxfastscan;		/* boot-time value of maxfastscan */
	pgcnt_t ci_fastscan;		/* boot-time value of fastscan */
	pgcnt_t ci_slowscan;		/* boot-time value of slowscan */
	pgcnt_t ci_handspreadpages;	/* boot-time value of handspreadpages */
} clockinit = { .ci_init = false };
360 
/*
 * Constrain "value" to the inclusive range [minimum, maximum].  If the
 * bounds cross (minimum > maximum), the lower-bound check is applied
 * first, matching the order of comparison.
 */
static pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
	if (value < minimum)
		return (minimum);

	if (value > maximum)
		return (maximum);

	return (value);
}
372 
/*
 * Select between an operator-supplied tunable ("initval") and a computed
 * default ("defval").  The operator's value is honoured only when it is
 * non-zero and strictly below "initval_ceiling"; otherwise the default
 * is returned.
 */
static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
	bool use_operator_value =
	    (initval != 0 && initval < initval_ceiling);

	return (use_operator_value ? initval : defval);
}
382 
383 /*
384  * Set up the paging constants for the clock algorithm used by
385  * pageout_scanner(), and by the virtual memory system overall.  See the
386  * comments at the top of this file for more information about the threshold
387  * values and system responses to memory pressure.
388  *
389  * This routine is called once by main() at startup, after the initial size of
390  * physical memory is determined.  It may be called again later if memory is
391  * added to or removed from the system, or if new measurements of the page scan
392  * rate become available.
393  */
394 void
395 setupclock(void)
396 {
397 	pgcnt_t defval;
398 	bool half = (pageout_threshold_style == 1);
399 	bool recalc = true;
400 
401 	looppages = total_pages;
402 
403 	/*
404 	 * The operator may have provided specific values for some of the
405 	 * tunables via /etc/system.  On our first call, we preserve those
406 	 * values so that they can be used for subsequent recalculations.
407 	 *
408 	 * A value of zero for any tunable means we will use the default
409 	 * sizing.
410 	 */
411 	if (!clockinit.ci_init) {
412 		clockinit.ci_init = true;
413 
414 		clockinit.ci_lotsfree_min = lotsfree_min;
415 		clockinit.ci_lotsfree_max = lotsfree_max;
416 		clockinit.ci_lotsfree = lotsfree;
417 		clockinit.ci_desfree = desfree;
418 		clockinit.ci_minfree = minfree;
419 		clockinit.ci_throttlefree = throttlefree;
420 		clockinit.ci_pageout_reserve = pageout_reserve;
421 		clockinit.ci_maxpgio = maxpgio;
422 		clockinit.ci_maxfastscan = maxfastscan;
423 		clockinit.ci_fastscan = fastscan;
424 		clockinit.ci_slowscan = slowscan;
425 		clockinit.ci_handspreadpages = handspreadpages;
426 
427 		/*
428 		 * The first call does not trigger a recalculation, only
429 		 * subsequent calls.
430 		 */
431 		recalc = false;
432 	}
433 
434 	/*
435 	 * Configure paging threshold values.  For more details on what each
436 	 * threshold signifies, see the comments at the top of this file.
437 	 */
438 	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
439 	    btop(LOTSFREE_MAX_DEFAULT));
440 	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
441 	    btop(LOTSFREE_MIN_DEFAULT));
442 
443 	lotsfree = tune(clockinit.ci_lotsfree, looppages,
444 	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
445 
446 	desfree = tune(clockinit.ci_desfree, lotsfree,
447 	    lotsfree / 2);
448 
449 	minfree = tune(clockinit.ci_minfree, desfree,
450 	    half ? desfree / 2 : 3 * desfree / 4);
451 
452 	throttlefree = tune(clockinit.ci_throttlefree, desfree,
453 	    minfree);
454 
455 	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
456 	    half ? throttlefree / 2 : 3 * throttlefree / 4);
457 
458 	/*
459 	 * Maxpgio thresholds how much paging is acceptable.
460 	 * This figures that 2/3 busy on an arm is all that is
461 	 * tolerable for paging.  We assume one operation per disk rev.
462 	 *
463 	 * XXX - Does not account for multiple swap devices.
464 	 */
465 	if (clockinit.ci_maxpgio == 0) {
466 		maxpgio = (DISKRPM * 2) / 3;
467 	} else {
468 		maxpgio = clockinit.ci_maxpgio;
469 	}
470 
471 	/*
472 	 * The clock scan rate varies between fastscan and slowscan
473 	 * based on the amount of free memory available.  Fastscan
474 	 * rate should be set based on the number pages that can be
475 	 * scanned per sec using ~10% of processor time.  Since this
476 	 * value depends on the processor, MMU, Mhz etc., it is
477 	 * difficult to determine it in a generic manner for all
478 	 * architectures.
479 	 *
480 	 * Instead of trying to determine the number of pages scanned
481 	 * per sec for every processor, fastscan is set to be the smaller
482 	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
483 	 * time is limited to ~4% of processor time.
484 	 *
485 	 * Setting fastscan to be 1/2 of memory allows pageout to scan
486 	 * all of memory in ~2 secs.  This implies that user pages not
487 	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
488 	 * can be reclaimed when free memory is very low.  Stealing pages
489 	 * not accessed within 1 sec seems reasonable and ensures that
490 	 * active user processes don't thrash.
491 	 *
492 	 * Smaller values of fastscan result in scanning fewer pages
493 	 * every second and consequently pageout may not be able to free
494 	 * sufficient memory to maintain the minimum threshold.  Larger
495 	 * values of fastscan result in scanning a lot more pages which
496 	 * could lead to thrashing and higher CPU usage.
497 	 *
498 	 * Fastscan needs to be limited to a maximum value and should not
499 	 * scale with memory to prevent pageout from consuming too much
500 	 * time for scanning on slow CPU's and avoid thrashing, as a
501 	 * result of scanning too many pages, on faster CPU's.
502 	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
503 	 * (the upper bound for fastscan) based on the average number
504 	 * of pages that can potentially be scanned in ~1 sec (using ~4%
505 	 * of the CPU) on some of the following machines that currently
506 	 * run Solaris 2.x:
507 	 *
508 	 *			average memory scanned in ~1 sec
509 	 *
510 	 *	25 Mhz SS1+:		23 Meg
511 	 *	LX:			37 Meg
512 	 *	50 Mhz SC2000:		68 Meg
513 	 *
514 	 *	40 Mhz 486:		26 Meg
515 	 *	66 Mhz 486:		42 Meg
516 	 *
517 	 * When free memory falls just below lotsfree, the scan rate
518 	 * goes from 0 to slowscan (i.e., pageout starts running).  This
519 	 * transition needs to be smooth and is achieved by ensuring that
520 	 * pageout scans a small number of pages to satisfy the transient
521 	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
522 	 * wakeup) since scanning that many pages has no noticible impact
523 	 * on system performance.
524 	 *
525 	 * In addition to setting fastscan and slowscan, pageout is
526 	 * limited to using ~4% of the CPU.  This results in increasing
527 	 * the time taken to scan all of memory, which in turn means that
528 	 * user processes have a better opportunity of preventing their
529 	 * pages from being stolen.  This has a positive effect on
530 	 * interactive and overall system performance when memory demand
531 	 * is high.
532 	 *
533 	 * Thus, the rate at which pages are scanned for replacement will
534 	 * vary linearly between slowscan and the number of pages that
535 	 * can be scanned using ~4% of processor time instead of varying
536 	 * linearly between slowscan and fastscan.
537 	 *
538 	 * Also, the processor time used by pageout will vary from ~1%
539 	 * at slowscan to ~4% at fastscan instead of varying between
540 	 * ~1% at slowscan and ~10% at fastscan.
541 	 *
542 	 * The values chosen for the various VM parameters (fastscan,
543 	 * handspreadpages, etc) are not universally true for all machines,
544 	 * but appear to be a good rule of thumb for the machines we've
545 	 * tested.  They have the following ranges:
546 	 *
547 	 *	cpu speed:	20 to 70 Mhz
548 	 *	page size:	4K to 8K
549 	 *	memory size:	16M to 5G
550 	 *	page scan rate:	4000 - 17400 4K pages per sec
551 	 *
552 	 * The values need to be re-examined for machines which don't
553 	 * fall into the various ranges (e.g., slower or faster CPUs,
554 	 * smaller or larger pagesizes etc) shown above.
555 	 *
556 	 * On an MP machine, pageout is often unable to maintain the
557 	 * minimum paging thresholds under heavy load.  This is due to
558 	 * the fact that user processes running on other CPU's can be
559 	 * dirtying memory at a much faster pace than pageout can find
560 	 * pages to free.  The memory demands could be met by enabling
561 	 * more than one CPU to run the clock algorithm in such a manner
562 	 * that the various clock hands don't overlap.  This also makes
563 	 * it more difficult to determine the values for fastscan, slowscan
564 	 * and handspreadpages.
565 	 *
566 	 * The swapper is currently used to free up memory when pageout
567 	 * is unable to meet memory demands by swapping out processes.
568 	 * In addition to freeing up memory, swapping also reduces the
569 	 * demand for memory by preventing user processes from running
570 	 * and thereby consuming memory.
571 	 */
572 	if (clockinit.ci_maxfastscan == 0) {
573 		if (pageout_new_spread != 0) {
574 			maxfastscan = pageout_new_spread;
575 		} else {
576 			maxfastscan = MAXHANDSPREADPAGES;
577 		}
578 	} else {
579 		maxfastscan = clockinit.ci_maxfastscan;
580 	}
581 
582 	if (clockinit.ci_fastscan == 0) {
583 		fastscan = MIN(looppages / loopfraction, maxfastscan);
584 	} else {
585 		fastscan = clockinit.ci_fastscan;
586 	}
587 
588 	if (fastscan > looppages / loopfraction) {
589 		fastscan = looppages / loopfraction;
590 	}
591 
592 	/*
593 	 * Set slow scan time to 1/10 the fast scan time, but
594 	 * not to exceed maxslowscan.
595 	 */
596 	if (clockinit.ci_slowscan == 0) {
597 		slowscan = MIN(fastscan / 10, maxslowscan);
598 	} else {
599 		slowscan = clockinit.ci_slowscan;
600 	}
601 
602 	if (slowscan > fastscan / 2) {
603 		slowscan = fastscan / 2;
604 	}
605 
606 	/*
607 	 * Handspreadpages is distance (in pages) between front and back
608 	 * pageout daemon hands.  The amount of time to reclaim a page
609 	 * once pageout examines it increases with this distance and
610 	 * decreases as the scan rate rises. It must be < the amount
611 	 * of pageable memory.
612 	 *
613 	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
614 	 * to be "fastscan" results in the front hand being a few secs
615 	 * (varies based on the processor speed) ahead of the back hand
616 	 * at fastscan rates.  This distance can be further reduced, if
617 	 * necessary, by increasing the processor time used by pageout
618 	 * to be more than ~4% and preferrably not more than ~10%.
619 	 *
620 	 * As a result, user processes have a much better chance of
621 	 * referencing their pages before the back hand examines them.
622 	 * This also significantly lowers the number of reclaims from
623 	 * the freelist since pageout does not end up freeing pages which
624 	 * may be referenced a sec later.
625 	 */
626 	if (clockinit.ci_handspreadpages == 0) {
627 		handspreadpages = fastscan;
628 	} else {
629 		handspreadpages = clockinit.ci_handspreadpages;
630 	}
631 
632 	/*
633 	 * Make sure that back hand follows front hand by at least
634 	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
635 	 * back hand to look at a page during the same wakeup of the pageout
636 	 * daemon in which the front hand cleared its ref bit.
637 	 */
638 	if (handspreadpages >= looppages) {
639 		handspreadpages = looppages - 1;
640 	}
641 
642 	/*
643 	 * If we have been called to recalculate the parameters, set a flag to
644 	 * re-evaluate the clock hand pointers.
645 	 */
646 	if (recalc) {
647 		reset_hands = 1;
648 	}
649 }
650 
651 /*
652  * Pageout scheduling.
653  *
654  * Schedpaging controls the rate at which the page out daemon runs by
655  * setting the global variables nscan and desscan SCHEDPAGING_HZ
656  * times a second.  Nscan records the number of pages pageout has examined
657  * in its current pass; schedpaging() resets this value to zero each time
658  * it runs.  Desscan records the number of pages pageout should examine
659  * in its next pass; schedpaging() sets this value based on the amount of
660  * currently available memory.
661  */
662 #define	SCHEDPAGING_HZ	4
663 
664 static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */
665 
666 /*
667  * Pool of available async pageout putpage requests.
668  */
669 static struct async_reqs *push_req;
670 static struct async_reqs *req_freelist;	/* available req structs */
671 static struct async_reqs *push_list;	/* pending reqs */
672 static kmutex_t push_lock;		/* protects req pool */
673 static kcondvar_t push_cv;
674 
675 /*
676  * If pageout() is stuck on a single push for this many seconds,
677  * pageout_deadman() will assume the system has hit a memory deadlock.  If set
678  * to 0, the deadman will have no effect.
679  *
680  * Note that we are only looking for stalls in the calls that pageout() makes
681  * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
682  * I/O, which should not take long unless the underlying strategy call blocks
683  * indefinitely for memory.  The actual I/O request happens (or fails) later.
684  */
685 uint_t pageout_deadman_seconds = 90;
686 
687 static uint_t pageout_stucktime = 0;
688 static bool pageout_pushing = false;
689 static uint64_t pageout_pushcount = 0;
690 static uint64_t pageout_pushcount_seen = 0;
691 
692 static int async_list_size = 256;	/* number of async request structs */
693 
694 static void pageout_scanner(void);
695 
696 /*
697  * If a page is being shared more than "po_share" times
698  * then leave it alone- don't page it out.
699  */
700 #define	MIN_PO_SHARE	(8)
701 #define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
702 ulong_t	po_share = MIN_PO_SHARE;
703 
704 /*
705  * Schedule rate for paging.
706  * Rate is linear interpolation between
707  * slowscan with lotsfree and fastscan when out of memory.
708  */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	/* Reap the kmem caches if free memory is below the reap-ahead mark. */
	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	/* Ask the seg layer to reap its caches as well. */
	if (freemem < lotsfree + needfree)
		seg_preap();

	/* Wake the cageout thread if the kernel cage is itself short. */
	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		/* pageout() not running */
		nscan = 0;
		/*
		 * Compute the effective free-memory level used to
		 * interpolate the scan rate, clamped to [0, lotsfree].
		 */
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		/*
		 * Fix for 1161438 (CRS SPR# 73922).  All variables
		 * in the original calculation for desscan were 32 bit signed
		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
		 * more of memory, the calculation can overflow.  When this
		 * happens, desscan becomes negative and pageout_scanner()
		 * stops paging out.
		 */
		if (needfree > 0 && pageout_new_spread == 0) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / SCHEDPAGING_HZ;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / SCHEDPAGING_HZ;
			desscan = (pgcnt_t)result;
		}

		/*
		 * Scale the scanner's per-wakeup CPU budget linearly from
		 * min_pageout_nsec (no pressure) up to max_pageout_nsec
		 * (no available memory).
		 */
		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

		if (freemem < lotsfree + needfree ||
		    pageout_sample_cnt < pageout_sample_lim) {
			/*
			 * Either we need more memory, or we still need to
			 * measure the average scan rate.  Wake the scanner.
			 */
			DTRACE_PROBE(pageout__cv__signal);
			cv_signal(&proc_pageout->p_cv);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner thread.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE) {
				po_share >>= 1;
			}
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
	 * in this case it is not needed - the waiters will be woken up during
	 * the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	/* Re-arm ourselves to run again in 1/SCHEDPAGING_HZ seconds. */
	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}
801 
802 pgcnt_t		pushes;
803 ulong_t		push_list_size;		/* # of requests on pageout queue */
804 
805 /*
806  * Paging out should always be enabled.  This tunable exists to hold pageout
807  * for debugging purposes.  If set to 0, pageout_scanner() will go back to
808  * sleep each time it is woken by schedpaging().
809  */
810 uint_t dopageout = 1;
811 
812 /*
813  * The page out daemon, which runs as process 2.
814  *
815  * As long as there are at least lotsfree pages,
816  * this process is not run.  When the number of free
817  * pages stays in the range desfree to lotsfree,
818  * this daemon runs through the pages in the loop
819  * at a rate determined in schedpaging().  Pageout manages
820  * two hands on the clock.  The front hand moves through
821  * memory, clearing the reference bit,
822  * and stealing pages from procs that are over maxrss.
823  * The back hand travels a distance behind the front hand,
824  * freeing the pages that have not been referenced in the time
825  * since the front hand passed.  If modified, they are pushed to
826  * swap before being freed.
827  *
828  * There are 2 threads that act on behalf of the pageout process.
829  * One thread scans pages (pageout_scanner) and frees them up if
830  * they don't require any VOP_PUTPAGE operation. If a page must be
831  * written back to its backing store, the request is put on a list
832  * and the other (pageout) thread is signaled. The pageout thread
833  * grabs VOP_PUTPAGE requests from the list, and processes them.
834  * Some filesystems may require resources for the VOP_PUTPAGE
835  * operations (like memory) and hence can block the pageout
836  * thread, but the scanner thread can still operate. There is still
837  * no guarantee that memory deadlocks cannot occur.
838  *
839  * For now, this thing is in very rough form.
840  */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	/*
	 * Record ourselves as the pageout process and start its CPU time
	 * accounting from a clean slate.
	 */
	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime =  0;
	proc_pageout->p_cutime =  0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Initialize the lock protecting the scanner wakeup state and the
	 * lock protecting the async push request list.
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	/*
	 * Chain every request onto the freelist; the final element's a_next
	 * is left NULL courtesy of kmem_zalloc().
	 */
	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri;

	/* Create the pageout scanner thread, one priority level below us. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
	    pageout_pri - 1);

	/*
	 * kick off pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	/*
	 * Main loop: take VOP_PUTPAGE requests off push_list (queued by
	 * queue_io_request()) and issue them, sleeping whenever the list is
	 * empty or we have exceeded the per-interval push budget.
	 */
	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			/* Each wakeup starts a fresh interval. */
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		/* Tell the deadman that a push is in flight. */
		pageout_pushing = true;
		mutex_exit(&push_lock);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		/* Push complete; return the request to the freelist. */
		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}
932 
933 /*
934  * Kernel thread that scans pages looking for ones to free
935  */
static void
pageout_scanner(void)
{
	struct page *fronthand, *backhand;
	uint_t laps;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_limit;
	pgcnt_t	pcount;
	bool sampling;

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down, and restarts shouldn't be that often.
	 */

	/*
	 * Set the two clock hands to be separated by a reasonable amount,
	 * but no more than 360 degrees apart.
	 */
	backhand = page_first();
	if (handspreadpages >= total_pages) {
		fronthand = page_nextn(backhand, total_pages - 1);
	} else {
		fronthand = page_nextn(backhand, handspreadpages);
	}

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle.  The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second:
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);

loop:
	/* Nudge the pusher thread if work is queued, then go to sleep. */
	cv_signal_pageout();

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

	/*
	 * Check if pageout has been disabled for debugging purposes:
	 */
	if (!dopageout) {
		goto loop;
	}

	/*
	 * One may reset the clock hands for debugging purposes.  Hands will
	 * also be reset if memory is added to or removed from the system.
	 */
	if (reset_hands) {
		reset_hands = 0;

		backhand = page_first();
		if (handspreadpages >= total_pages) {
			fronthand = page_nextn(backhand, total_pages - 1);
		} else {
			fronthand = page_nextn(backhand, handspreadpages);
		}
	}

	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

	/*
	 * Keep track of the number of times we have scanned all the way around
	 * the loop:
	 */
	laps = 0;

	DTRACE_PROBE(pageout__start);

	/*
	 * Track the number of pages visited during this scan so that we can
	 * periodically measure our duty cycle.
	 */
	pcount = 0;

	if (pageout_sample_cnt < pageout_sample_lim) {
		/*
		 * We need to measure the rate at which the system is able to
		 * scan pages of memory.  Each of these initial samples is a
		 * scan of all system memory, regardless of whether or not we
		 * are experiencing memory pressure.
		 */
		nscan_limit = total_pages;
		sampling = true;
	} else {
		nscan_limit = desscan;
		sampling = false;
	}

	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 */
	while (nscan < nscan_limit) {
		checkpage_result_t rvfront, rvback;

		if (!sampling && freemem >= lotsfree + needfree) {
			/*
			 * We are not sampling and enough memory has become
			 * available that scanning is no longer required.
			 */
			break;
		}

		/*
		 * Periodically check to see if we have exceeded the CPU duty
		 * cycle for a single wakeup.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			pageout_cycle_nsec = gethrtime() - sample_start;
			if (pageout_cycle_nsec >= pageout_nsec) {
				++pageout_timeouts;
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
			laps = 0;
		}
		if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
			laps = 0;
		}

		++pcount;

		/*
		 * Protected by pageout_mutex instead of cpu_stat_lock:
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
			nscan++;
		}

		/* Advance both hands by one page, preserving their spread. */
		backhand = page_next(backhand);
		fronthand = page_next(fronthand);

		/*
		 * The front hand has wrapped around to the first page in the
		 * loop.
		 */
		if (fronthand == page_first()) {
			laps++;
			DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps);

			/*
			 * Protected by pageout_mutex instead of cpu_stat_lock:
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);

			if (laps > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					break;
				}
			}
		}
	}

	sample_end = gethrtime();

	DTRACE_PROBE1(pageout__end, uint_t, laps);

	/*
	 * pageout_new_spread is zero until the initial sampling phase has
	 * finished; once it is set, setupclock() has been given a measured
	 * scan rate and no further samples are collected.
	 */
	if (pageout_new_spread == 0) {
		if (pageout_sample_cnt < pageout_sample_lim) {
			/*
			 * Continue accumulating samples until we have enough
			 * to get a reasonable value for average scan rate:
			 */
			pageout_sample_pages += pcount;
			pageout_sample_etime += sample_end - sample_start;
			++pageout_sample_cnt;
		}

		if (pageout_sample_cnt >= pageout_sample_lim) {
			/*
			 * We have enough samples, set the spread.
			 */
			pageout_rate = (hrrate_t)pageout_sample_pages *
			    (hrrate_t)(NANOSEC) / pageout_sample_etime;
			pageout_new_spread = pageout_rate / 10;
			setupclock();
		}
	}

	goto loop;
}
1152 
1153 /*
1154  * The pageout deadman is run once per second by clock().
1155  */
1156 void
1157 pageout_deadman(void)
1158 {
1159 	if (panicstr != NULL) {
1160 		/*
1161 		 * There is no pageout after panic.
1162 		 */
1163 		return;
1164 	}
1165 
1166 	if (pageout_deadman_seconds == 0) {
1167 		/*
1168 		 * The deadman is not enabled.
1169 		 */
1170 		return;
1171 	}
1172 
1173 	if (!pageout_pushing) {
1174 		goto reset;
1175 	}
1176 
1177 	/*
1178 	 * We are pushing a page.  Check to see if it is the same call we saw
1179 	 * last time we looked:
1180 	 */
1181 	if (pageout_pushcount != pageout_pushcount_seen) {
1182 		/*
1183 		 * It is a different call from the last check, so we are not
1184 		 * stuck.
1185 		 */
1186 		goto reset;
1187 	}
1188 
1189 	if (++pageout_stucktime >= pageout_deadman_seconds) {
1190 		panic("pageout_deadman: stuck pushing the same page for %d "
1191 		    "seconds (freemem is %lu)", pageout_deadman_seconds,
1192 		    freemem);
1193 	}
1194 
1195 	return;
1196 
1197 reset:
1198 	/*
1199 	 * Reset our tracking state to reflect that we are not stuck:
1200 	 */
1201 	pageout_stucktime = 0;
1202 	pageout_pushcount_seen = pageout_pushcount;
1203 }
1204 
1205 /*
1206  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
1207  * system (u., page table) or free, then leave it alone.  Otherwise,
1208  * if we are running the front hand, turn off the page's reference bit.
1209  * If the proc is over maxrss, we take it.  If running the back hand,
1210  * check whether the page has been reclaimed.  If not, free the page,
1211  * pushing it to disk first if necessary.
1212  *
1213  * Return values:
1214  *	CKP_INELIGIBLE if the page is not a candidate at all,
1215  *	CKP_NOT_FREED  if the page was not freed, or
1216  *	CKP_FREED      if we freed it.
1217  */
static checkpage_result_t
checkpage(struct page *pp, pageout_hand_t whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode since
	 *	    they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share times
	 *	- that are already locked
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */

	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (CKP_INELIGIBLE);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (CKP_INELIGIBLE);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Reject pages that cannot be freed. The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */
	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == POH_FRONT) {
		pagesync_flag = HAT_SYNC_ZERORM;
	} else {
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;
	}

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If page is referenced; make unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		DTRACE_PROBE2(pageout__isref, page_t *, pp,
		    pageout_hand_t, whichhand);

		if (whichhand == POH_FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}

		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (CKP_NOT_FREED);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it. If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (CKP_INELIGIBLE);
		}

		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);

		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange to have it
	 * cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue I/O request for the pageout thread.  The vnode hold
		 * is released by the pageout thread after the push (or here,
		 * if no request structure was available).
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (CKP_NOT_FREED);
		}
		return (CKP_FREED);
	}

	/*
	 * Now we unload all the translations and put the page back on to the
	 * free list.  If the page was used (referenced or modified) after the
	 * pagesync but before it was unloaded we catch it and handle the page
	 * properly.
	 */
	DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
		goto recheck;
	}

	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (CKP_FREED);
}
1406 
1407 /*
1408  * Queue async i/o request from pageout_scanner and segment swapout
1409  * routines on one common list.  This ensures that pageout devices (swap)
1410  * are not saturated by pageout_scanner or swapout requests.
1411  * The pageout thread empties this list by initiating i/o operations.
1412  */
1413 int
1414 queue_io_request(vnode_t *vp, u_offset_t off)
1415 {
1416 	struct async_reqs *arg;
1417 
1418 	/*
1419 	 * If we cannot allocate an async request struct,
1420 	 * skip this page.
1421 	 */
1422 	mutex_enter(&push_lock);
1423 	if ((arg = req_freelist) == NULL) {
1424 		mutex_exit(&push_lock);
1425 		return (0);
1426 	}
1427 	req_freelist = arg->a_next;		/* adjust freelist */
1428 	push_list_size++;
1429 
1430 	arg->a_vp = vp;
1431 	arg->a_off = off;
1432 	arg->a_len = PAGESIZE;
1433 	arg->a_flags = B_ASYNC | B_FREE;
1434 	arg->a_cred = kcred;		/* always held */
1435 
1436 	/*
1437 	 * Add to list of pending write requests.
1438 	 */
1439 	arg->a_next = push_list;
1440 	push_list = arg;
1441 
1442 	if (req_freelist == NULL) {
1443 		/*
1444 		 * No free async requests left. The lock is held so we
1445 		 * might as well signal the pusher thread now.
1446 		 */
1447 		cv_signal(&push_cv);
1448 	}
1449 	mutex_exit(&push_lock);
1450 	return (1);
1451 }
1452 
1453 /*
1454  * Wakeup pageout to initiate i/o if push_list is not empty.
1455  */
1456 void
1457 cv_signal_pageout()
1458 {
1459 	if (push_list != NULL) {
1460 		mutex_enter(&push_lock);
1461 		cv_signal(&push_cv);
1462 		mutex_exit(&push_lock);
1463 	}
1464 }
1465