xref: /illumos-gate/usr/src/uts/common/os/vm_pageout.c (revision a92282e44f968185a6bba094d1e5fece2da819cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2020 Oxide Computer Company
24  */
25 
26 /*
27  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 /* All Rights Reserved */
33 
34 /*
35  * University Copyright- Copyright (c) 1982, 1986, 1988
36  * The Regents of the University of California
37  * All Rights Reserved
38  *
39  * University Acknowledgment- Portions of this document are derived from
40  * software developed by the University of California, Berkeley, and its
41  * contributors.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/buf.h>
48 #include <sys/uio.h>
49 #include <sys/proc.h>
50 #include <sys/systm.h>
51 #include <sys/mman.h>
52 #include <sys/cred.h>
53 #include <sys/vnode.h>
54 #include <sys/vm.h>
55 #include <sys/vmparam.h>
56 #include <sys/vtrace.h>
57 #include <sys/cmn_err.h>
58 #include <sys/cpuvar.h>
59 #include <sys/user.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <sys/callb.h>
63 #include <sys/tnf_probe.h>
64 #include <sys/mem_cage.h>
65 #include <sys/time.h>
66 #include <sys/stdbool.h>
67 
68 #include <vm/hat.h>
69 #include <vm/as.h>
70 #include <vm/seg.h>
71 #include <vm/page.h>
72 #include <vm/pvn.h>
73 #include <vm/seg_kmem.h>
74 
75 static int checkpage(page_t *, int);
76 
77 /*
78  * The following parameters control operation of the page replacement
79  * algorithm.  They are initialized to 0, and then computed at boot time
80  * based on the size of the system.  If they are patched non-zero in
81  * a loaded vmunix they are left alone and may thus be changed per system
82  * using adb on the loaded system.
83  */
84 pgcnt_t		slowscan = 0;
85 pgcnt_t		fastscan = 0;
86 
87 static pgcnt_t	handspreadpages = 0;
88 static int	loopfraction = 2;
89 static pgcnt_t	looppages;
90 static int	min_percent_cpu = 4;
91 static int	max_percent_cpu = 80;
92 static pgcnt_t	maxfastscan = 0;
93 static pgcnt_t	maxslowscan = 100;
94 
95 pgcnt_t	maxpgio = 0;
96 pgcnt_t	minfree = 0;
97 pgcnt_t	desfree = 0;
98 pgcnt_t	lotsfree = 0;
99 pgcnt_t	needfree = 0;
100 pgcnt_t	throttlefree = 0;
101 pgcnt_t	pageout_reserve = 0;
102 
103 pgcnt_t	deficit;
104 pgcnt_t	nscan;
105 pgcnt_t	desscan;
106 
107 /*
108  * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
109  * are the number of ticks in each wakeup cycle that gives the
110  * equivalent of some underlying %CPU duty cycle.
111  * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is
112  * awakened every 25 clock ticks.  So, converting from %CPU to ticks
113  * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
114  * So, for example, 4% == 1 tick and 80% == 20 ticks.
115  *
116  * min_pageout_ticks:
117  *     ticks/wakeup equivalent of min_percent_cpu.
118  *
119  * max_pageout_ticks:
120  *     ticks/wakeup equivalent of max_percent_cpu.
121  *
122  * pageout_ticks:
123  *     Number of clock ticks budgeted for each wakeup cycle.
124  *     Computed each time around by schedpaging().
125  *     Varies between min_pageout_ticks .. max_pageout_ticks,
126  *     depending on memory pressure.
127  *
128  * pageout_lbolt:
129  *     Timestamp of the last time pageout_scanner woke up and started
130  *     (or resumed) scanning for not recently referenced pages.
131  */
132 
133 static clock_t	min_pageout_ticks;
134 static clock_t	max_pageout_ticks;
135 static clock_t	pageout_ticks;
136 static clock_t	pageout_lbolt;
137 
138 static uint_t	reset_hands;
139 
140 #define	PAGES_POLL_MASK	1023
141 
142 /*
143  * pageout_sample_lim:
144  *     The limit on the number of samples needed to establish a value
145  *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
146  *
147  * pageout_sample_cnt:
148  *     Current sample number.  Once the sample gets large enough,
149  *     set new values for handspreadpages, fastscan and slowscan.
150  *
151  * pageout_sample_pages:
152  *     The accumulated number of pages scanned during sampling.
153  *
154  * pageout_sample_etime:
155  *     The accumulated elapsed time, in nanoseconds, for the sample.
156  *
157  * pageout_rate:
158  *     Rate in pages/second, computed at the end of sampling.
159  *
160  * pageout_new_spread:
161  *     The new value to use for fastscan and handspreadpages.
162  *     Calculated after enough samples have been taken.
163  */
164 
165 typedef hrtime_t hrrate_t;
166 
167 static uint64_t	pageout_sample_lim = 4;
168 static uint64_t	pageout_sample_cnt = 0;
169 static pgcnt_t	pageout_sample_pages = 0;
170 static hrrate_t	pageout_rate = 0;
171 static pgcnt_t	pageout_new_spread = 0;
172 
173 static clock_t	pageout_cycle_ticks;
174 static hrtime_t	sample_start, sample_end;
175 static hrtime_t	pageout_sample_etime = 0;
176 
177 /*
178  * Record number of times a pageout_scanner wakeup cycle finished because it
179  * timed out (exceeded its CPU budget), rather than because it visited
180  * its budgeted number of pages.
181  */
182 uint64_t pageout_timeouts = 0;
183 
184 #ifdef VM_STATS
185 static struct pageoutvmstats_str {
186 	ulong_t	checkpage[3];
187 } pageoutvmstats;
188 #endif /* VM_STATS */
189 
190 /*
191  * Threads waiting for free memory use this condition variable and lock until
192  * memory becomes available.
193  */
194 kmutex_t	memavail_lock;
195 kcondvar_t	memavail_cv;
196 
197 /*
198  * The size of the clock loop.
199  */
200 #define	LOOPPAGES	total_pages
201 
202 /*
203  * Set up the paging constants for the clock algorithm.
204  * Called after the system is initialized and the amount of memory
205  * and number of paging devices is known.
206  *
207  * lotsfree is 1/64 of memory, but at least 512K.
208  * desfree is 1/2 of lotsfree.
209  * minfree is 1/2 of desfree.
210  *
211  * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
212  *
213  *	lotsfree = btop(512K)
214  *	desfree = btop(200K)
215  *	minfree = btop(100K)
216  *	throttlefree = INT_MIN
217  *	max_percent_cpu = 4
218  */
219 void
220 setupclock(int recalc)
221 {
222 
223 	static spgcnt_t init_lfree, init_dfree, init_mfree;
224 	static spgcnt_t init_tfree, init_preserve, init_mpgio;
225 	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
226 
227 	looppages = LOOPPAGES;
228 
229 	/*
230 	 * setupclock can now be called to recalculate the paging
231 	 * parameters in the case of dynamic addition of memory.
232 	 * So to make sure we make the proper calculations, if such a
233 	 * situation should arise, we save away the initial values
234 	 * of each parameter so we can recall them when needed. This
235 	 * way we don't lose the settings an admin might have made
236 	 * through the /etc/system file.
237 	 */
238 
239 	if (!recalc) {
240 		init_lfree = lotsfree;
241 		init_dfree = desfree;
242 		init_mfree = minfree;
243 		init_tfree = throttlefree;
244 		init_preserve = pageout_reserve;
245 		init_mpgio = maxpgio;
246 		init_mfscan = maxfastscan;
247 		init_fscan = fastscan;
248 		init_sscan = slowscan;
249 		init_hspages = handspreadpages;
250 	}
251 
252 	/*
253 	 * Set up thresholds for paging:
254 	 */
255 
256 	/*
257 	 * Lotsfree is threshold where paging daemon turns on.
258 	 */
259 	if (init_lfree == 0 || init_lfree >= looppages)
260 		lotsfree = MAX(looppages / 64, btop(512 * 1024));
261 	else
262 		lotsfree = init_lfree;
263 
264 	/*
265 	 * Desfree is amount of memory desired free.
266 	 * If less than this for extended period, start swapping.
267 	 */
268 	if (init_dfree == 0 || init_dfree >= lotsfree)
269 		desfree = lotsfree / 2;
270 	else
271 		desfree = init_dfree;
272 
273 	/*
274 	 * Minfree is minimal amount of free memory which is tolerable.
275 	 */
276 	if (init_mfree == 0 || init_mfree >= desfree)
277 		minfree = desfree / 2;
278 	else
279 		minfree = init_mfree;
280 
281 	/*
282 	 * Throttlefree is the point at which we start throttling
283 	 * PG_WAIT requests until enough memory becomes available.
284 	 */
285 	if (init_tfree == 0 || init_tfree >= desfree)
286 		throttlefree = minfree;
287 	else
288 		throttlefree = init_tfree;
289 
290 	/*
291 	 * Pageout_reserve is the number of pages that we keep in
292 	 * stock for pageout's own use.  Having a few such pages
293 	 * provides insurance against system deadlock due to
294 	 * pageout needing pages.  When freemem < pageout_reserve,
295 	 * non-blocking allocations are denied to any threads
296 	 * other than pageout and sched.  (At some point we might
297 	 * want to consider a per-thread flag like T_PUSHING_PAGES
298 	 * to indicate that a thread is part of the page-pushing
299 	 * dance (e.g. an interrupt thread) and thus is entitled
300 	 * to the same special dispensation we accord pageout.)
301 	 */
302 	if (init_preserve == 0 || init_preserve >= throttlefree)
303 		pageout_reserve = throttlefree / 2;
304 	else
305 		pageout_reserve = init_preserve;
306 
307 	/*
308 	 * Maxpgio thresholds how much paging is acceptable.
309 	 * This figures that 2/3 busy on an arm is all that is
310 	 * tolerable for paging.  We assume one operation per disk rev.
311 	 *
312 	 * XXX - Does not account for multiple swap devices.
313 	 */
314 	if (init_mpgio == 0)
315 		maxpgio = (DISKRPM * 2) / 3;
316 	else
317 		maxpgio = init_mpgio;
318 
319 	/*
320 	 * The clock scan rate varies between fastscan and slowscan
321 	 * based on the amount of free memory available.  Fastscan
322 	 * rate should be set based on the number pages that can be
323 	 * scanned per sec using ~10% of processor time.  Since this
324 	 * value depends on the processor, MMU, Mhz etc., it is
325 	 * difficult to determine it in a generic manner for all
326 	 * architectures.
327 	 *
328 	 * Instead of trying to determine the number of pages scanned
329 	 * per sec for every processor, fastscan is set to be the smaller
330 	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
331 	 * time is limited to ~4% of processor time.
332 	 *
333 	 * Setting fastscan to be 1/2 of memory allows pageout to scan
334 	 * all of memory in ~2 secs.  This implies that user pages not
335 	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
336 	 * can be reclaimed when free memory is very low.  Stealing pages
337 	 * not accessed within 1 sec seems reasonable and ensures that
338 	 * active user processes don't thrash.
339 	 *
340 	 * Smaller values of fastscan result in scanning fewer pages
341 	 * every second and consequently pageout may not be able to free
342 	 * sufficient memory to maintain the minimum threshold.  Larger
343 	 * values of fastscan result in scanning a lot more pages which
344 	 * could lead to thrashing and higher CPU usage.
345 	 *
346 	 * Fastscan needs to be limited to a maximum value and should not
347 	 * scale with memory to prevent pageout from consuming too much
348 	 * time for scanning on slow CPU's and avoid thrashing, as a
349 	 * result of scanning too many pages, on faster CPU's.
350 	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
351 	 * (the upper bound for fastscan) based on the average number
352 	 * of pages that can potentially be scanned in ~1 sec (using ~4%
353 	 * of the CPU) on some of the following machines that currently
354 	 * run Solaris 2.x:
355 	 *
356 	 *			average memory scanned in ~1 sec
357 	 *
358 	 *	25 Mhz SS1+:		23 Meg
359 	 *	LX:			37 Meg
360 	 *	50 Mhz SC2000:		68 Meg
361 	 *
362 	 *	40 Mhz 486:		26 Meg
363 	 *	66 Mhz 486:		42 Meg
364 	 *
365 	 * When free memory falls just below lotsfree, the scan rate
366 	 * goes from 0 to slowscan (i.e., pageout starts running).  This
367 	 * transition needs to be smooth and is achieved by ensuring that
368 	 * pageout scans a small number of pages to satisfy the transient
369 	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
370 	 * wakeup) since scanning that many pages has no noticible impact
371 	 * on system performance.
372 	 *
373 	 * In addition to setting fastscan and slowscan, pageout is
374 	 * limited to using ~4% of the CPU.  This results in increasing
375 	 * the time taken to scan all of memory, which in turn means that
376 	 * user processes have a better opportunity of preventing their
377 	 * pages from being stolen.  This has a positive effect on
378 	 * interactive and overall system performance when memory demand
379 	 * is high.
380 	 *
381 	 * Thus, the rate at which pages are scanned for replacement will
382 	 * vary linearly between slowscan and the number of pages that
383 	 * can be scanned using ~4% of processor time instead of varying
384 	 * linearly between slowscan and fastscan.
385 	 *
386 	 * Also, the processor time used by pageout will vary from ~1%
387 	 * at slowscan to ~4% at fastscan instead of varying between
388 	 * ~1% at slowscan and ~10% at fastscan.
389 	 *
390 	 * The values chosen for the various VM parameters (fastscan,
391 	 * handspreadpages, etc) are not universally true for all machines,
392 	 * but appear to be a good rule of thumb for the machines we've
393 	 * tested.  They have the following ranges:
394 	 *
395 	 *	cpu speed:	20 to 70 Mhz
396 	 *	page size:	4K to 8K
397 	 *	memory size:	16M to 5G
398 	 *	page scan rate:	4000 - 17400 4K pages per sec
399 	 *
400 	 * The values need to be re-examined for machines which don't
401 	 * fall into the various ranges (e.g., slower or faster CPUs,
402 	 * smaller or larger pagesizes etc) shown above.
403 	 *
404 	 * On an MP machine, pageout is often unable to maintain the
405 	 * minimum paging thresholds under heavy load.  This is due to
406 	 * the fact that user processes running on other CPU's can be
407 	 * dirtying memory at a much faster pace than pageout can find
408 	 * pages to free.  The memory demands could be met by enabling
409 	 * more than one CPU to run the clock algorithm in such a manner
410 	 * that the various clock hands don't overlap.  This also makes
411 	 * it more difficult to determine the values for fastscan, slowscan
412 	 * and handspreadpages.
413 	 *
414 	 * The swapper is currently used to free up memory when pageout
415 	 * is unable to meet memory demands by swapping out processes.
416 	 * In addition to freeing up memory, swapping also reduces the
417 	 * demand for memory by preventing user processes from running
418 	 * and thereby consuming memory.
419 	 */
420 	if (init_mfscan == 0) {
421 		if (pageout_new_spread != 0)
422 			maxfastscan = pageout_new_spread;
423 		else
424 			maxfastscan = MAXHANDSPREADPAGES;
425 	} else {
426 		maxfastscan = init_mfscan;
427 	}
428 	if (init_fscan == 0)
429 		fastscan = MIN(looppages / loopfraction, maxfastscan);
430 	else
431 		fastscan = init_fscan;
432 	if (fastscan > looppages / loopfraction)
433 		fastscan = looppages / loopfraction;
434 
435 	/*
436 	 * Set slow scan time to 1/10 the fast scan time, but
437 	 * not to exceed maxslowscan.
438 	 */
439 	if (init_sscan == 0)
440 		slowscan = MIN(fastscan / 10, maxslowscan);
441 	else
442 		slowscan = init_sscan;
443 	if (slowscan > fastscan / 2)
444 		slowscan = fastscan / 2;
445 
446 	/*
447 	 * Handspreadpages is distance (in pages) between front and back
448 	 * pageout daemon hands.  The amount of time to reclaim a page
449 	 * once pageout examines it increases with this distance and
450 	 * decreases as the scan rate rises. It must be < the amount
451 	 * of pageable memory.
452 	 *
453 	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
454 	 * to be "fastscan" results in the front hand being a few secs
455 	 * (varies based on the processor speed) ahead of the back hand
456 	 * at fastscan rates.  This distance can be further reduced, if
457 	 * necessary, by increasing the processor time used by pageout
458 	 * to be more than ~4% and preferrably not more than ~10%.
459 	 *
460 	 * As a result, user processes have a much better chance of
461 	 * referencing their pages before the back hand examines them.
462 	 * This also significantly lowers the number of reclaims from
463 	 * the freelist since pageout does not end up freeing pages which
464 	 * may be referenced a sec later.
465 	 */
466 	if (init_hspages == 0)
467 		handspreadpages = fastscan;
468 	else
469 		handspreadpages = init_hspages;
470 
471 	/*
472 	 * Make sure that back hand follows front hand by at least
473 	 * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
474 	 * for the back hand to look at a page during the same wakeup of
475 	 * the pageout daemon in which the front hand cleared its ref bit.
476 	 */
477 	if (handspreadpages >= looppages)
478 		handspreadpages = looppages - 1;
479 
480 	/*
481 	 * If we have been called to recalculate the parameters,
482 	 * set a flag to re-evaluate the clock hand pointers.
483 	 */
484 	if (recalc)
485 		reset_hands = 1;
486 }
487 
488 /*
489  * Pageout scheduling.
490  *
491  * Schedpaging controls the rate at which the page out daemon runs by
492  * setting the global variables nscan and desscan RATETOSCHEDPAGING
493  * times a second.  Nscan records the number of pages pageout has examined
494  * in its current pass; schedpaging resets this value to zero each time
495  * it runs.  Desscan records the number of pages pageout should examine
496  * in its next pass; schedpaging sets this value based on the amount of
497  * currently available memory.
498  */
499 
500 #define	RATETOSCHEDPAGING	4		/* hz that is */
501 
502 static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */
503 
504 /*
505  * Pool of available async pageout putpage requests.
506  */
507 static struct async_reqs *push_req;
508 static struct async_reqs *req_freelist;	/* available req structs */
509 static struct async_reqs *push_list;	/* pending reqs */
510 static kmutex_t push_lock;		/* protects req pool */
511 static kcondvar_t push_cv;
512 
513 /*
514  * If pageout() is stuck on a single push for this many seconds,
515  * pageout_deadman() will assume the system has hit a memory deadlock.  If set
516  * to 0, the deadman will have no effect.
517  *
518  * Note that we are only looking for stalls in the calls that pageout() makes
519  * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
520  * I/O, which should not take long unless the underlying strategy call blocks
521  * indefinitely for memory.  The actual I/O request happens (or fails) later.
522  */
523 uint_t pageout_deadman_seconds = 90;
524 
525 static uint_t pageout_stucktime = 0;
526 static bool pageout_pushing = false;
527 static uint64_t pageout_pushcount = 0;
528 static uint64_t pageout_pushcount_seen = 0;
529 
530 static int async_list_size = 256;	/* number of async request structs */
531 
532 static void pageout_scanner(void);
533 
534 /*
535  * If a page is being shared more than "po_share" times
536  * then leave it alone- don't page it out.
537  */
538 #define	MIN_PO_SHARE	(8)
539 #define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
540 ulong_t	po_share = MIN_PO_SHARE;
541 
542 /*
543  * Schedule rate for paging.
544  * Rate is linear interpolation between
545  * slowscan with lotsfree and fastscan when out of memory.
546  */
547 static void
548 schedpaging(void *arg)
549 {
550 	spgcnt_t vavail;
551 
552 	if (freemem < lotsfree + needfree + kmem_reapahead)
553 		kmem_reap();
554 
555 	if (freemem < lotsfree + needfree)
556 		seg_preap();
557 
558 	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
559 		kcage_cageout_wakeup();
560 
561 	if (mutex_tryenter(&pageout_mutex)) {
562 		/* pageout() not running */
563 		nscan = 0;
564 		vavail = freemem - deficit;
565 		if (pageout_new_spread != 0)
566 			vavail -= needfree;
567 		if (vavail < 0)
568 			vavail = 0;
569 		if (vavail > lotsfree)
570 			vavail = lotsfree;
571 
572 		/*
573 		 * Fix for 1161438 (CRS SPR# 73922).  All variables
574 		 * in the original calculation for desscan were 32 bit signed
575 		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
576 		 * more of memory, the calculation can overflow.  When this
577 		 * happens, desscan becomes negative and pageout_scanner()
578 		 * stops paging out.
579 		 */
580 		if ((needfree) && (pageout_new_spread == 0)) {
581 			/*
582 			 * If we've not yet collected enough samples to
583 			 * calculate a spread, use the old logic of kicking
584 			 * into high gear anytime needfree is non-zero.
585 			 */
586 			desscan = fastscan / RATETOSCHEDPAGING;
587 		} else {
588 			/*
589 			 * Once we've calculated a spread based on system
590 			 * memory and usage, just treat needfree as another
591 			 * form of deficit.
592 			 */
593 			spgcnt_t faststmp, slowstmp, result;
594 
595 			slowstmp = slowscan * vavail;
596 			faststmp = fastscan * (lotsfree - vavail);
597 			result = (slowstmp + faststmp) /
598 			    nz(lotsfree) / RATETOSCHEDPAGING;
599 			desscan = (pgcnt_t)result;
600 		}
601 
602 		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
603 		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
604 
605 		if (freemem < lotsfree + needfree ||
606 		    pageout_sample_cnt < pageout_sample_lim) {
607 			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
608 			    "pageout_cv_signal:freemem %ld", freemem);
609 			cv_signal(&proc_pageout->p_cv);
610 		} else {
611 			/*
612 			 * There are enough free pages, no need to
613 			 * kick the scanner thread.  And next time
614 			 * around, keep more of the `highly shared'
615 			 * pages.
616 			 */
617 			cv_signal_pageout();
618 			if (po_share > MIN_PO_SHARE) {
619 				po_share >>= 1;
620 			}
621 		}
622 		mutex_exit(&pageout_mutex);
623 	}
624 
625 	/*
626 	 * Signal threads waiting for available memory.
627 	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
628 	 * in this case it is not needed - the waiters will be waken up during
629 	 * the next invocation of this function.
630 	 */
631 	if (kmem_avail() > 0)
632 		cv_broadcast(&memavail_cv);
633 
634 	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
635 }
636 
637 pgcnt_t		pushes;
638 ulong_t		push_list_size;		/* # of requests on pageout queue */
639 
640 #define	FRONT	1
641 #define	BACK	2
642 
643 int dopageout = 1;	/* must be non-zero to turn page stealing on */
644 
645 /*
646  * The page out daemon, which runs as process 2.
647  *
648  * As long as there are at least lotsfree pages,
649  * this process is not run.  When the number of free
650  * pages stays in the range desfree to lotsfree,
651  * this daemon runs through the pages in the loop
652  * at a rate determined in schedpaging().  Pageout manages
653  * two hands on the clock.  The front hand moves through
654  * memory, clearing the reference bit,
655  * and stealing pages from procs that are over maxrss.
656  * The back hand travels a distance behind the front hand,
657  * freeing the pages that have not been referenced in the time
658  * since the front hand passed.  If modified, they are pushed to
659  * swap before being freed.
660  *
661  * There are 2 threads that act on behalf of the pageout process.
662  * One thread scans pages (pageout_scanner) and frees them up if
663  * they don't require any VOP_PUTPAGE operation. If a page must be
664  * written back to its backing store, the request is put on a list
665  * and the other (pageout) thread is signaled. The pageout thread
666  * grabs VOP_PUTPAGE requests from the list, and processes them.
667  * Some filesystems may require resources for the VOP_PUTPAGE
668  * operations (like memory) and hence can block the pageout
669  * thread, but the scanner thread can still operate. There is still
670  * no guarantee that memory deadlocks cannot occur.
671  *
672  * For now, this thing is in very rough form.
673  */
674 void
675 pageout()
676 {
677 	struct async_reqs *arg;
678 	pri_t pageout_pri;
679 	int i;
680 	pgcnt_t max_pushes;
681 	callb_cpr_t cprinfo;
682 
683 	proc_pageout = ttoproc(curthread);
684 	proc_pageout->p_cstime = 0;
685 	proc_pageout->p_stime =  0;
686 	proc_pageout->p_cutime =  0;
687 	proc_pageout->p_utime = 0;
688 	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
689 	bcopy("pageout", PTOU(curproc)->u_comm, 7);
690 
691 	/*
692 	 * Create pageout scanner thread
693 	 */
694 	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
695 	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
696 
697 	/*
698 	 * Allocate and initialize the async request structures
699 	 * for pageout.
700 	 */
701 	push_req = (struct async_reqs *)
702 	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
703 
704 	req_freelist = push_req;
705 	for (i = 0; i < async_list_size - 1; i++)
706 		push_req[i].a_next = &push_req[i + 1];
707 
708 	pageout_pri = curthread->t_pri;
709 
710 	/* Create the pageout scanner thread. */
711 	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
712 	    pageout_pri - 1);
713 
714 	/*
715 	 * kick off pageout scheduler.
716 	 */
717 	schedpaging(NULL);
718 
719 	/*
720 	 * Create kernel cage thread.
721 	 * The kernel cage thread is started under the pageout process
722 	 * to take advantage of the less restricted page allocation
723 	 * in page_create_throttle().
724 	 */
725 	kcage_cageout_init();
726 
727 	/*
728 	 * Limit pushes to avoid saturating pageout devices.
729 	 */
730 	max_pushes = maxpgio / RATETOSCHEDPAGING;
731 	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
732 
733 	for (;;) {
734 		mutex_enter(&push_lock);
735 
736 		while ((arg = push_list) == NULL || pushes > max_pushes) {
737 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
738 			cv_wait(&push_cv, &push_lock);
739 			pushes = 0;
740 			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
741 		}
742 		push_list = arg->a_next;
743 		arg->a_next = NULL;
744 		pageout_pushing = true;
745 		mutex_exit(&push_lock);
746 
747 		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
748 		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
749 			pushes++;
750 		}
751 
752 		/* vp held by checkpage() */
753 		VN_RELE(arg->a_vp);
754 
755 		mutex_enter(&push_lock);
756 		pageout_pushing = false;
757 		pageout_pushcount++;
758 		arg->a_next = req_freelist;	/* back on freelist */
759 		req_freelist = arg;
760 		push_list_size--;
761 		mutex_exit(&push_lock);
762 	}
763 }
764 
765 /*
766  * Kernel thread that scans pages looking for ones to free
767  */
768 static void
769 pageout_scanner(void)
770 {
771 	struct page *fronthand, *backhand;
772 	uint_t count;
773 	callb_cpr_t cprinfo;
774 	pgcnt_t	nscan_limit;
775 	pgcnt_t	pcount;
776 
777 	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
778 	mutex_enter(&pageout_mutex);
779 
780 	/*
781 	 * The restart case does not attempt to point the hands at roughly
782 	 * the right point on the assumption that after one circuit things
783 	 * will have settled down - and restarts shouldn't be that often.
784 	 */
785 
786 	/*
787 	 * Set the two clock hands to be separated by a reasonable amount,
788 	 * but no more than 360 degrees apart.
789 	 */
790 	backhand = page_first();
791 	if (handspreadpages >= total_pages)
792 		fronthand = page_nextn(backhand, total_pages - 1);
793 	else
794 		fronthand = page_nextn(backhand, handspreadpages);
795 
796 	min_pageout_ticks = MAX(1,
797 	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
798 	max_pageout_ticks = MAX(min_pageout_ticks,
799 	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
800 
801 loop:
802 	cv_signal_pageout();
803 
804 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
805 	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
806 	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
807 
808 	if (!dopageout)
809 		goto loop;
810 
811 	if (reset_hands) {
812 		reset_hands = 0;
813 
814 		backhand = page_first();
815 		if (handspreadpages >= total_pages)
816 			fronthand = page_nextn(backhand, total_pages - 1);
817 		else
818 			fronthand = page_nextn(backhand, handspreadpages);
819 	}
820 
821 	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
822 	count = 0;
823 
824 	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
825 	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
826 	    freemem, lotsfree, nscan, desscan);
827 
828 	/* Kernel probe */
829 	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
830 	    tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
831 
832 	pcount = 0;
833 	if (pageout_sample_cnt < pageout_sample_lim) {
834 		nscan_limit = total_pages;
835 	} else {
836 		nscan_limit = desscan;
837 	}
838 	pageout_lbolt = ddi_get_lbolt();
839 	sample_start = gethrtime();
840 
841 	/*
842 	 * Scan the appropriate number of pages for a single duty cycle.
843 	 * However, stop scanning as soon as there is enough free memory.
844 	 * For a short while, we will be sampling the performance of the
845 	 * scanner and need to keep running just to get sample data, in
846 	 * which case we keep going and don't pay attention to whether
847 	 * or not there is enough free memory.
848 	 */
849 
850 	while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
851 	    pageout_sample_cnt < pageout_sample_lim)) {
852 		int rvfront, rvback;
853 
854 		/*
855 		 * Check to see if we have exceeded our %CPU budget
856 		 * for this wakeup, but not on every single page visited,
857 		 * just every once in a while.
858 		 */
859 		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
860 			pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
861 			if (pageout_cycle_ticks >= pageout_ticks) {
862 				++pageout_timeouts;
863 				break;
864 			}
865 		}
866 
867 		/*
868 		 * If checkpage manages to add a page to the free list,
869 		 * we give ourselves another couple of trips around the loop.
870 		 */
871 		if ((rvfront = checkpage(fronthand, FRONT)) == 1)
872 			count = 0;
873 		if ((rvback = checkpage(backhand, BACK)) == 1)
874 			count = 0;
875 
876 		++pcount;
877 
878 		/*
879 		 * protected by pageout_mutex instead of cpu_stat_lock
880 		 */
881 		CPU_STATS_ADDQ(CPU, vm, scan, 1);
882 
883 		/*
884 		 * Don't include ineligible pages in the number scanned.
885 		 */
886 		if (rvfront != -1 || rvback != -1)
887 			nscan++;
888 
889 		backhand = page_next(backhand);
890 
891 		/*
892 		 * backhand update and wraparound check are done separately
893 		 * because lint barks when it finds an empty "if" body
894 		 */
895 
896 		if ((fronthand = page_next(fronthand)) == page_first())	{
897 			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
898 			    "pageout_hand_wrap:freemem %ld whichhand %d",
899 			    freemem, FRONT);
900 
901 			/*
902 			 * protected by pageout_mutex instead of cpu_stat_lock
903 			 */
904 			CPU_STATS_ADDQ(CPU, vm, rev, 1);
905 			if (++count > 1) {
906 				/*
907 				 * Extremely unlikely, but it happens.
908 				 * We went around the loop at least once
909 				 * and didn't get far enough.
910 				 * If we are still skipping `highly shared'
911 				 * pages, skip fewer of them.  Otherwise,
912 				 * give up till the next clock tick.
913 				 */
914 				if (po_share < MAX_PO_SHARE) {
915 					po_share <<= 1;
916 				} else {
917 					/*
918 					 * Really a "goto loop", but
919 					 * if someone is TRACing or
920 					 * TNF_PROBE_ing, at least
921 					 * make records to show
922 					 * where we are.
923 					 */
924 					break;
925 				}
926 			}
927 		}
928 	}
929 
930 	sample_end = gethrtime();
931 
932 	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
933 	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
934 	    freemem, lotsfree, nscan, desscan, count);
935 
936 	/* Kernel probe */
937 	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
938 	    tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
939 
940 	if (pageout_sample_cnt < pageout_sample_lim) {
941 		pageout_sample_pages += pcount;
942 		pageout_sample_etime += sample_end - sample_start;
943 		++pageout_sample_cnt;
944 	}
945 	if (pageout_sample_cnt >= pageout_sample_lim &&
946 	    pageout_new_spread == 0) {
947 		pageout_rate = (hrrate_t)pageout_sample_pages *
948 		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
949 		pageout_new_spread = pageout_rate / 10;
950 		setupclock(1);
951 	}
952 
953 	goto loop;
954 }
955 
956 /*
957  * The pageout deadman is run once per second by clock().
958  */
959 void
960 pageout_deadman(void)
961 {
962 	if (panicstr != NULL) {
963 		/*
964 		 * There is no pageout after panic.
965 		 */
966 		return;
967 	}
968 
969 	if (pageout_deadman_seconds == 0) {
970 		/*
971 		 * The deadman is not enabled.
972 		 */
973 		return;
974 	}
975 
976 	if (!pageout_pushing) {
977 		goto reset;
978 	}
979 
980 	/*
981 	 * We are pushing a page.  Check to see if it is the same call we saw
982 	 * last time we looked:
983 	 */
984 	if (pageout_pushcount != pageout_pushcount_seen) {
985 		/*
986 		 * It is a different call from the last check, so we are not
987 		 * stuck.
988 		 */
989 		goto reset;
990 	}
991 
992 	if (++pageout_stucktime >= pageout_deadman_seconds) {
993 		panic("pageout_deadman: stuck pushing the same page for %d "
994 		    "seconds (freemem is %lu)", pageout_deadman_seconds,
995 		    freemem);
996 	}
997 
998 	return;
999 
1000 reset:
1001 	/*
1002 	 * Reset our tracking state to reflect that we are not stuck:
1003 	 */
1004 	pageout_stucktime = 0;
1005 	pageout_pushcount_seen = pageout_pushcount;
1006 }
1007 
1008 /*
1009  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
1010  * system (u., page table) or free, then leave it alone.  Otherwise,
1011  * if we are running the front hand, turn off the page's reference bit.
1012  * If the proc is over maxrss, we take it.  If running the back hand,
1013  * check whether the page has been reclaimed.  If not, free the page,
1014  * pushing it to disk first if necessary.
1015  *
1016  * Return values:
1017  *	-1 if the page is not a candidate at all,
1018  *	 0 if not freed, or
1019  *	 1 if we freed it.
1020  */
1021 static int
1022 checkpage(struct page *pp, int whichhand)
1023 {
1024 	int ppattr;
1025 	int isfs = 0;
1026 	int isexec = 0;
1027 	int pagesync_flag;
1028 
1029 	/*
1030 	 * Skip pages:
1031 	 *	- associated with the kernel vnode since
1032 	 *	    they are always "exclusively" locked.
1033 	 *	- that are free
1034 	 *	- that are shared more than po_share'd times
1035 	 *	- its already locked
1036 	 *
1037 	 * NOTE:  These optimizations assume that reads are atomic.
1038 	 */
1039 
1040 	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1041 	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1042 	    hat_page_checkshare(pp, po_share)) {
1043 		return (-1);
1044 	}
1045 
1046 	if (!page_trylock(pp, SE_EXCL)) {
1047 		/*
1048 		 * Skip the page if we can't acquire the "exclusive" lock.
1049 		 */
1050 		return (-1);
1051 	} else if (PP_ISFREE(pp)) {
1052 		/*
1053 		 * It became free between the above check and our actually
1054 		 * locking the page.  Oh, well there will be other pages.
1055 		 */
1056 		page_unlock(pp);
1057 		return (-1);
1058 	}
1059 
1060 	/*
1061 	 * Reject pages that cannot be freed. The page_struct_lock
1062 	 * need not be acquired to examine these
1063 	 * fields since the page has an "exclusive" lock.
1064 	 */
1065 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1066 		page_unlock(pp);
1067 		return (-1);
1068 	}
1069 
1070 	/*
1071 	 * Maintain statistics for what we are freeing
1072 	 */
1073 
1074 	if (pp->p_vnode != NULL) {
1075 		if (pp->p_vnode->v_flag & VVMEXEC)
1076 			isexec = 1;
1077 
1078 		if (!IS_SWAPFSVP(pp->p_vnode))
1079 			isfs = 1;
1080 	}
1081 
1082 	/*
1083 	 * Turn off REF and MOD bits with the front hand.
1084 	 * The back hand examines the REF bit and always considers
1085 	 * SHARED pages as referenced.
1086 	 */
1087 	if (whichhand == FRONT)
1088 		pagesync_flag = HAT_SYNC_ZERORM;
1089 	else
1090 		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1091 		    HAT_SYNC_STOPON_SHARED;
1092 
1093 	ppattr = hat_pagesync(pp, pagesync_flag);
1094 
1095 recheck:
1096 	/*
1097 	 * If page is referenced; make unreferenced but reclaimable.
1098 	 * If this page is not referenced, then it must be reclaimable
1099 	 * and we can add it to the free list.
1100 	 */
1101 	if (ppattr & P_REF) {
1102 		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1103 		    "pageout_isref:pp %p whichhand %d", pp, whichhand);
1104 		if (whichhand == FRONT) {
1105 			/*
1106 			 * Checking of rss or madvise flags needed here...
1107 			 *
1108 			 * If not "well-behaved", fall through into the code
1109 			 * for not referenced.
1110 			 */
1111 			hat_clrref(pp);
1112 		}
1113 		/*
1114 		 * Somebody referenced the page since the front
1115 		 * hand went by, so it's not a candidate for
1116 		 * freeing up.
1117 		 */
1118 		page_unlock(pp);
1119 		return (0);
1120 	}
1121 
1122 	VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1123 
1124 	/*
1125 	 * If large page, attempt to demote it. If successfully demoted,
1126 	 * retry the checkpage.
1127 	 */
1128 	if (pp->p_szc != 0) {
1129 		if (!page_try_demote_pages(pp)) {
1130 			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1131 			page_unlock(pp);
1132 			return (-1);
1133 		}
1134 		ASSERT(pp->p_szc == 0);
1135 		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1136 		/*
1137 		 * since page_try_demote_pages() could have unloaded some
1138 		 * mappings it makes sense to reload ppattr.
1139 		 */
1140 		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1141 	}
1142 
1143 	/*
1144 	 * If the page is currently dirty, we have to arrange
1145 	 * to have it cleaned before it can be freed.
1146 	 *
1147 	 * XXX - ASSERT(pp->p_vnode != NULL);
1148 	 */
1149 	if ((ppattr & P_MOD) && pp->p_vnode) {
1150 		struct vnode *vp = pp->p_vnode;
1151 		u_offset_t offset = pp->p_offset;
1152 
1153 		/*
1154 		 * XXX - Test for process being swapped out or about to exit?
1155 		 * [Can't get back to process(es) using the page.]
1156 		 */
1157 
1158 		/*
1159 		 * Hold the vnode before releasing the page lock to
1160 		 * prevent it from being freed and re-used by some
1161 		 * other thread.
1162 		 */
1163 		VN_HOLD(vp);
1164 		page_unlock(pp);
1165 
1166 		/*
1167 		 * Queue i/o request for the pageout thread.
1168 		 */
1169 		if (!queue_io_request(vp, offset)) {
1170 			VN_RELE(vp);
1171 			return (0);
1172 		}
1173 		return (1);
1174 	}
1175 
1176 	/*
1177 	 * Now we unload all the translations,
1178 	 * and put the page back on to the free list.
1179 	 * If the page was used (referenced or modified) after
1180 	 * the pagesync but before it was unloaded we catch it
1181 	 * and handle the page properly.
1182 	 */
1183 	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1184 	    "pageout_free:pp %p whichhand %d", pp, whichhand);
1185 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1186 	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1187 	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1188 		goto recheck;
1189 
1190 	/*LINTED: constant in conditional context*/
1191 	VN_DISPOSE(pp, B_FREE, 0, kcred);
1192 
1193 	CPU_STATS_ADD_K(vm, dfree, 1);
1194 
1195 	if (isfs) {
1196 		if (isexec) {
1197 			CPU_STATS_ADD_K(vm, execfree, 1);
1198 		} else {
1199 			CPU_STATS_ADD_K(vm, fsfree, 1);
1200 		}
1201 	} else {
1202 		CPU_STATS_ADD_K(vm, anonfree, 1);
1203 	}
1204 
1205 	return (1);		/* freed a page! */
1206 }
1207 
1208 /*
1209  * Queue async i/o request from pageout_scanner and segment swapout
1210  * routines on one common list.  This ensures that pageout devices (swap)
1211  * are not saturated by pageout_scanner or swapout requests.
1212  * The pageout thread empties this list by initiating i/o operations.
1213  */
1214 int
1215 queue_io_request(vnode_t *vp, u_offset_t off)
1216 {
1217 	struct async_reqs *arg;
1218 
1219 	/*
1220 	 * If we cannot allocate an async request struct,
1221 	 * skip this page.
1222 	 */
1223 	mutex_enter(&push_lock);
1224 	if ((arg = req_freelist) == NULL) {
1225 		mutex_exit(&push_lock);
1226 		return (0);
1227 	}
1228 	req_freelist = arg->a_next;		/* adjust freelist */
1229 	push_list_size++;
1230 
1231 	arg->a_vp = vp;
1232 	arg->a_off = off;
1233 	arg->a_len = PAGESIZE;
1234 	arg->a_flags = B_ASYNC | B_FREE;
1235 	arg->a_cred = kcred;		/* always held */
1236 
1237 	/*
1238 	 * Add to list of pending write requests.
1239 	 */
1240 	arg->a_next = push_list;
1241 	push_list = arg;
1242 
1243 	if (req_freelist == NULL) {
1244 		/*
1245 		 * No free async requests left. The lock is held so we
1246 		 * might as well signal the pusher thread now.
1247 		 */
1248 		cv_signal(&push_cv);
1249 	}
1250 	mutex_exit(&push_lock);
1251 	return (1);
1252 }
1253 
1254 /*
1255  * Wakeup pageout to initiate i/o if push_list is not empty.
1256  */
1257 void
1258 cv_signal_pageout()
1259 {
1260 	if (push_list != NULL) {
1261 		mutex_enter(&push_lock);
1262 		cv_signal(&push_cv);
1263 		mutex_exit(&push_lock);
1264 	}
1265 }
1266