xref: /titanic_52/usr/src/uts/common/os/vm_pageout.c (revision ea8dc4b6d2251b437950c0056bc626b311c73c27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/buf.h>
46 #include <sys/uio.h>
47 #include <sys/proc.h>
48 #include <sys/systm.h>
49 #include <sys/mman.h>
50 #include <sys/cred.h>
51 #include <sys/vnode.h>
52 #include <sys/vm.h>
53 #include <sys/vmparam.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/cpuvar.h>
57 #include <sys/user.h>
58 #include <sys/kmem.h>
59 #include <sys/debug.h>
60 #include <sys/callb.h>
61 #include <sys/tnf_probe.h>
62 #include <sys/mem_cage.h>
63 #include <sys/time.h>
64 
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/seg.h>
68 #include <vm/page.h>
69 #include <vm/pvn.h>
70 #include <vm/seg_kmem.h>
71 
72 static int checkpage(page_t *, int);
73 
74 /*
75  * The following parameters control operation of the page replacement
76  * algorithm.  They are initialized to 0, and then computed at boot time
77  * based on the size of the system.  If they are patched non-zero in
78  * a loaded vmunix they are left alone and may thus be changed per system
79  * using adb on the loaded system.
80  */
81 pgcnt_t		slowscan = 0;
82 pgcnt_t		fastscan = 0;
83 
84 static pgcnt_t	handspreadpages = 0;
85 static int	loopfraction = 2;
86 static pgcnt_t	looppages;
87 static int	min_percent_cpu = 4;
88 static int	max_percent_cpu = 80;
89 static pgcnt_t	maxfastscan = 0;
90 static pgcnt_t	maxslowscan = 100;
91 
92 pgcnt_t	maxpgio = 0;
93 pgcnt_t	minfree = 0;
94 pgcnt_t	desfree = 0;
95 pgcnt_t	lotsfree = 0;
96 pgcnt_t	needfree = 0;
97 pgcnt_t	throttlefree = 0;
98 pgcnt_t	pageout_reserve = 0;
99 
100 pgcnt_t	deficit;
101 pgcnt_t	nscan;
102 pgcnt_t	desscan;
103 
104 /*
105  * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
106  * are the number of ticks in each wakeup cycle that gives the
107  * equivalent of some underlying %CPU duty cycle.
108  * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is
109  * awakened every 25 clock ticks.  So, converting from %CPU to ticks
110  * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
111  * So, for example, 4% == 1 tick and 80% == 20 ticks.
112  *
113  * min_pageout_ticks:
114  *     ticks/wakeup equivalent of min_percent_cpu.
115  *
116  * max_pageout_ticks:
117  *     ticks/wakeup equivalent of max_percent_cpu.
118  *
119  * pageout_ticks:
120  *     Number of clock ticks budgeted for each wakeup cycle.
121  *     Computed each time around by schedpaging().
122  *     Varies between min_pageout_ticks .. max_pageout_ticks,
123  *     depending on memory pressure.
124  *
125  * pageout_lbolt:
126  *     Timestamp of the last time pageout_scanner woke up and started
127  *     (or resumed) scanning for not recently referenced pages.
128  */
129 
130 static clock_t	min_pageout_ticks;
131 static clock_t	max_pageout_ticks;
132 static clock_t	pageout_ticks;
133 static clock_t	pageout_lbolt;
134 
135 static uint_t	reset_hands;
136 
137 #define	PAGES_POLL_MASK	1023
138 
139 /*
140  * pageout_sample_lim:
141  *     The limit on the number of samples needed to establish a value
142  *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
143  *
144  * pageout_sample_cnt:
145  *     Current sample number.  Once the sample gets large enough,
146  *     set new values for handspreadpages, fastscan and slowscan.
147  *
148  * pageout_sample_pages:
149  *     The accumulated number of pages scanned during sampling.
150  *
151  * pageout_sample_etime:
152  *     The accumulated elapsed time, in nanoseconds, for the sample.
153  *
154  * pageout_rate:
155  *     Rate in pages/second, computed at the end of sampling.
156  *
157  * pageout_new_spread:
158  *     The new value to use for fastscan and handspreadpages.
159  *     Calculated after enough samples have been taken.
160  */
161 
162 typedef hrtime_t hrrate_t;
163 
164 static uint64_t	pageout_sample_lim = 4;
165 static uint64_t	pageout_sample_cnt = 0;
166 static pgcnt_t	pageout_sample_pages = 0;
167 static hrrate_t	pageout_rate = 0;
168 static pgcnt_t	pageout_new_spread = 0;
169 
170 static clock_t	pageout_cycle_ticks;
171 static hrtime_t	sample_start, sample_end;
172 static hrtime_t	pageout_sample_etime = 0;
173 
174 /*
175  * Record number of times a pageout_scanner wakeup cycle finished because it
176  * timed out (exceeded its CPU budget), rather than because it visited
177  * its budgeted number of pages.
178  */
179 uint64_t pageout_timeouts = 0;
180 
181 #ifdef VM_STATS
182 static struct pageoutvmstats_str {
183 	ulong_t	checkpage[3];
184 } pageoutvmstats;
185 #endif /* VM_STATS */
186 
187 /*
188  * Threads waiting for free memory use this condition variable and lock until
189  * memory becomes available.
190  */
191 kmutex_t	memavail_lock;
192 kcondvar_t	memavail_cv;
193 
194 /*
195  * The size of the clock loop.
196  */
197 #define	LOOPPAGES	total_pages
198 
199 /*
200  * Set up the paging constants for the clock algorithm.
201  * Called after the system is initialized and the amount of memory
202  * and number of paging devices is known.
203  *
204  * lotsfree is 1/64 of memory, but at least 512K.
205  * desfree is 1/2 of lotsfree.
206  * minfree is 1/2 of desfree.
207  *
208  * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
209  *
210  *	lotsfree = btop(512K)
211  *	desfree = btop(200K)
212  *	minfree = btop(100K)
213  *	throttlefree = INT_MIN
214  *	max_percent_cpu = 4
215  */
216 void
217 setupclock(int recalc)
218 {
219 
220 	static spgcnt_t init_lfree, init_dfree, init_mfree;
221 	static spgcnt_t init_tfree, init_preserve, init_mpgio;
222 	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
223 
224 	looppages = LOOPPAGES;
225 
226 	/*
227 	 * setupclock can now be called to recalculate the paging
228 	 * parameters in the case of dynamic addition of memory.
229 	 * So to make sure we make the proper calculations, if such a
230 	 * situation should arise, we save away the initial values
231 	 * of each parameter so we can recall them when needed. This
232 	 * way we don't lose the settings an admin might have made
233 	 * through the /etc/system file.
234 	 */
235 
236 	if (!recalc) {
237 		init_lfree = lotsfree;
238 		init_dfree = desfree;
239 		init_mfree = minfree;
240 		init_tfree = throttlefree;
241 		init_preserve = pageout_reserve;
242 		init_mpgio = maxpgio;
243 		init_mfscan = maxfastscan;
244 		init_fscan = fastscan;
245 		init_sscan = slowscan;
246 		init_hspages = handspreadpages;
247 	}
248 
249 	/*
250 	 * Set up thresholds for paging:
251 	 */
252 
253 	/*
254 	 * Lotsfree is threshold where paging daemon turns on.
255 	 */
256 	if (init_lfree == 0 || init_lfree >= looppages)
257 		lotsfree = MAX(looppages / 64, btop(512 * 1024));
258 	else
259 		lotsfree = init_lfree;
260 
261 	/*
262 	 * Desfree is amount of memory desired free.
263 	 * If less than this for extended period, start swapping.
264 	 */
265 	if (init_dfree == 0 || init_dfree >= lotsfree)
266 		desfree = lotsfree / 2;
267 	else
268 		desfree = init_dfree;
269 
270 	/*
271 	 * Minfree is minimal amount of free memory which is tolerable.
272 	 */
273 	if (init_mfree == 0 || init_mfree >= desfree)
274 		minfree = desfree / 2;
275 	else
276 		minfree = init_mfree;
277 
278 	/*
279 	 * Throttlefree is the point at which we start throttling
280 	 * PG_WAIT requests until enough memory becomes available.
281 	 */
282 	if (init_tfree == 0 || init_tfree >= desfree)
283 		throttlefree = minfree;
284 	else
285 		throttlefree = init_tfree;
286 
287 	/*
288 	 * Pageout_reserve is the number of pages that we keep in
289 	 * stock for pageout's own use.  Having a few such pages
290 	 * provides insurance against system deadlock due to
291 	 * pageout needing pages.  When freemem < pageout_reserve,
292 	 * non-blocking allocations are denied to any threads
293 	 * other than pageout and sched.  (At some point we might
294 	 * want to consider a per-thread flag like T_PUSHING_PAGES
295 	 * to indicate that a thread is part of the page-pushing
296 	 * dance (e.g. an interrupt thread) and thus is entitled
297 	 * to the same special dispensation we accord pageout.)
298 	 */
299 	if (init_preserve == 0 || init_preserve >= throttlefree)
300 		pageout_reserve = throttlefree / 2;
301 	else
302 		pageout_reserve = init_preserve;
303 
304 	/*
305 	 * Maxpgio thresholds how much paging is acceptable.
306 	 * This figures that 2/3 busy on an arm is all that is
307 	 * tolerable for paging.  We assume one operation per disk rev.
308 	 *
309 	 * XXX - Does not account for multiple swap devices.
310 	 */
311 	if (init_mpgio == 0)
312 		maxpgio = (DISKRPM * 2) / 3;
313 	else
314 		maxpgio = init_mpgio;
315 
316 	/*
317 	 * The clock scan rate varies between fastscan and slowscan
318 	 * based on the amount of free memory available.  Fastscan
319 	 * rate should be set based on the number pages that can be
320 	 * scanned per sec using ~10% of processor time.  Since this
321 	 * value depends on the processor, MMU, Mhz etc., it is
322 	 * difficult to determine it in a generic manner for all
323 	 * architectures.
324 	 *
325 	 * Instead of trying to determine the number of pages scanned
326 	 * per sec for every processor, fastscan is set to be the smaller
327 	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
328 	 * time is limited to ~4% of processor time.
329 	 *
330 	 * Setting fastscan to be 1/2 of memory allows pageout to scan
331 	 * all of memory in ~2 secs.  This implies that user pages not
332 	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
333 	 * can be reclaimed when free memory is very low.  Stealing pages
334 	 * not accessed within 1 sec seems reasonable and ensures that
335 	 * active user processes don't thrash.
336 	 *
337 	 * Smaller values of fastscan result in scanning fewer pages
338 	 * every second and consequently pageout may not be able to free
339 	 * sufficient memory to maintain the minimum threshold.  Larger
340 	 * values of fastscan result in scanning a lot more pages which
341 	 * could lead to thrashing and higher CPU usage.
342 	 *
343 	 * Fastscan needs to be limited to a maximum value and should not
344 	 * scale with memory to prevent pageout from consuming too much
345 	 * time for scanning on slow CPU's and avoid thrashing, as a
346 	 * result of scanning too many pages, on faster CPU's.
347 	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
348 	 * (the upper bound for fastscan) based on the average number
349 	 * of pages that can potentially be scanned in ~1 sec (using ~4%
350 	 * of the CPU) on some of the following machines that currently
351 	 * run Solaris 2.x:
352 	 *
353 	 *			average memory scanned in ~1 sec
354 	 *
355 	 *	25 Mhz SS1+:		23 Meg
356 	 *	LX:			37 Meg
357 	 *	50 Mhz SC2000:		68 Meg
358 	 *
359 	 *	40 Mhz 486:		26 Meg
360 	 *	66 Mhz 486:		42 Meg
361 	 *
362 	 * When free memory falls just below lotsfree, the scan rate
363 	 * goes from 0 to slowscan (i.e., pageout starts running).  This
364 	 * transition needs to be smooth and is achieved by ensuring that
365 	 * pageout scans a small number of pages to satisfy the transient
366 	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
367 	 * wakeup) since scanning that many pages has no noticible impact
368 	 * on system performance.
369 	 *
370 	 * In addition to setting fastscan and slowscan, pageout is
371 	 * limited to using ~4% of the CPU.  This results in increasing
372 	 * the time taken to scan all of memory, which in turn means that
373 	 * user processes have a better opportunity of preventing their
374 	 * pages from being stolen.  This has a positive effect on
375 	 * interactive and overall system performance when memory demand
376 	 * is high.
377 	 *
378 	 * Thus, the rate at which pages are scanned for replacement will
379 	 * vary linearly between slowscan and the number of pages that
380 	 * can be scanned using ~4% of processor time instead of varying
381 	 * linearly between slowscan and fastscan.
382 	 *
383 	 * Also, the processor time used by pageout will vary from ~1%
384 	 * at slowscan to ~4% at fastscan instead of varying between
385 	 * ~1% at slowscan and ~10% at fastscan.
386 	 *
387 	 * The values chosen for the various VM parameters (fastscan,
388 	 * handspreadpages, etc) are not universally true for all machines,
389 	 * but appear to be a good rule of thumb for the machines we've
390 	 * tested.  They have the following ranges:
391 	 *
392 	 *	cpu speed:	20 to 70 Mhz
393 	 *	page size:	4K to 8K
394 	 *	memory size:	16M to 5G
395 	 *	page scan rate:	4000 - 17400 4K pages per sec
396 	 *
397 	 * The values need to be re-examined for machines which don't
398 	 * fall into the various ranges (e.g., slower or faster CPUs,
399 	 * smaller or larger pagesizes etc) shown above.
400 	 *
401 	 * On an MP machine, pageout is often unable to maintain the
402 	 * minimum paging thresholds under heavy load.  This is due to
403 	 * the fact that user processes running on other CPU's can be
404 	 * dirtying memory at a much faster pace than pageout can find
405 	 * pages to free.  The memory demands could be met by enabling
406 	 * more than one CPU to run the clock algorithm in such a manner
407 	 * that the various clock hands don't overlap.  This also makes
408 	 * it more difficult to determine the values for fastscan, slowscan
409 	 * and handspreadpages.
410 	 *
411 	 * The swapper is currently used to free up memory when pageout
412 	 * is unable to meet memory demands by swapping out processes.
413 	 * In addition to freeing up memory, swapping also reduces the
414 	 * demand for memory by preventing user processes from running
415 	 * and thereby consuming memory.
416 	 */
417 	if (init_mfscan == 0) {
418 		if (pageout_new_spread != 0)
419 			maxfastscan = pageout_new_spread;
420 		else
421 			maxfastscan = MAXHANDSPREADPAGES;
422 	} else {
423 		maxfastscan = init_mfscan;
424 	}
425 	if (init_fscan == 0)
426 		fastscan = MIN(looppages / loopfraction, maxfastscan);
427 	else
428 		fastscan = init_fscan;
429 	if (fastscan > looppages / loopfraction)
430 		fastscan = looppages / loopfraction;
431 
432 	/*
433 	 * Set slow scan time to 1/10 the fast scan time, but
434 	 * not to exceed maxslowscan.
435 	 */
436 	if (init_sscan == 0)
437 		slowscan = MIN(fastscan / 10, maxslowscan);
438 	else
439 		slowscan = init_sscan;
440 	if (slowscan > fastscan / 2)
441 		slowscan = fastscan / 2;
442 
443 	/*
444 	 * Handspreadpages is distance (in pages) between front and back
445 	 * pageout daemon hands.  The amount of time to reclaim a page
446 	 * once pageout examines it increases with this distance and
447 	 * decreases as the scan rate rises. It must be < the amount
448 	 * of pageable memory.
449 	 *
450 	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
451 	 * to be "fastscan" results in the front hand being a few secs
452 	 * (varies based on the processor speed) ahead of the back hand
453 	 * at fastscan rates.  This distance can be further reduced, if
454 	 * necessary, by increasing the processor time used by pageout
455 	 * to be more than ~4% and preferrably not more than ~10%.
456 	 *
457 	 * As a result, user processes have a much better chance of
458 	 * referencing their pages before the back hand examines them.
459 	 * This also significantly lowers the number of reclaims from
460 	 * the freelist since pageout does not end up freeing pages which
461 	 * may be referenced a sec later.
462 	 */
463 	if (init_hspages == 0)
464 		handspreadpages = fastscan;
465 	else
466 		handspreadpages = init_hspages;
467 
468 	/*
469 	 * Make sure that back hand follows front hand by at least
470 	 * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
471 	 * for the back hand to look at a page during the same wakeup of
472 	 * the pageout daemon in which the front hand cleared its ref bit.
473 	 */
474 	if (handspreadpages >= looppages)
475 		handspreadpages = looppages - 1;
476 
477 	/*
478 	 * If we have been called to recalculate the parameters,
479 	 * set a flag to re-evaluate the clock hand pointers.
480 	 */
481 	if (recalc)
482 		reset_hands = 1;
483 }
484 
485 /*
486  * Pageout scheduling.
487  *
488  * Schedpaging controls the rate at which the page out daemon runs by
489  * setting the global variables nscan and desscan RATETOSCHEDPAGING
490  * times a second.  Nscan records the number of pages pageout has examined
491  * in its current pass; schedpaging resets this value to zero each time
492  * it runs.  Desscan records the number of pages pageout should examine
493  * in its next pass; schedpaging sets this value based on the amount of
494  * currently available memory.
495  */
496 
497 #define	RATETOSCHEDPAGING	4		/* hz that is */
498 
499 static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */
500 
501 /*
502  * Pool of available async pageout putpage requests.
503  */
504 static struct async_reqs *push_req;
505 static struct async_reqs *req_freelist;	/* available req structs */
506 static struct async_reqs *push_list;	/* pending reqs */
507 static kmutex_t push_lock;		/* protects req pool */
508 static kcondvar_t push_cv;
509 
510 static int async_list_size = 256;	/* number of async request structs */
511 
512 static void pageout_scanner(void);
513 
514 /*
515  * If a page is being shared more than "po_share" times
516  * then leave it alone- don't page it out.
517  */
518 #define	MIN_PO_SHARE	(8)
519 #define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
520 ulong_t	po_share = MIN_PO_SHARE;
521 
522 /*
523  * Schedule rate for paging.
524  * Rate is linear interpolation between
525  * slowscan with lotsfree and fastscan when out of memory.
526  */
527 static void
528 schedpaging(void *arg)
529 {
530 	spgcnt_t vavail;
531 
532 	if (freemem < lotsfree + needfree + kmem_reapahead)
533 		kmem_reap();
534 
535 	if (freemem < lotsfree + needfree + seg_preapahead)
536 		seg_preap();
537 
538 	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
539 		kcage_cageout_wakeup();
540 
541 	if (mutex_tryenter(&pageout_mutex)) {
542 		/* pageout() not running */
543 		nscan = 0;
544 		vavail = freemem - deficit;
545 		if (vavail < 0)
546 			vavail = 0;
547 		if (vavail > lotsfree)
548 			vavail = lotsfree;
549 
550 		/*
551 		 * Fix for 1161438 (CRS SPR# 73922).  All variables
552 		 * in the original calculation for desscan were 32 bit signed
553 		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
554 		 * more of memory, the calculation can overflow.  When this
555 		 * happens, desscan becomes negative and pageout_scanner()
556 		 * stops paging out.
557 		 */
558 		if (needfree) {
559 			desscan = fastscan / RATETOSCHEDPAGING;
560 		} else {
561 			spgcnt_t faststmp, slowstmp, result;
562 
563 			slowstmp = slowscan * vavail;
564 			faststmp = fastscan * (lotsfree - vavail);
565 			result = (slowstmp + faststmp) /
566 				nz(lotsfree) / RATETOSCHEDPAGING;
567 			desscan = (pgcnt_t)result;
568 		}
569 
570 		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
571 		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
572 
573 		if (freemem < lotsfree + needfree ||
574 		    pageout_sample_cnt < pageout_sample_lim) {
575 			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
576 				"pageout_cv_signal:freemem %ld", freemem);
577 			cv_signal(&proc_pageout->p_cv);
578 		} else {
579 			/*
580 			 * There are enough free pages, no need to
581 			 * kick the scanner thread.  And next time
582 			 * around, keep more of the `highly shared'
583 			 * pages.
584 			 */
585 			cv_signal_pageout();
586 			if (po_share > MIN_PO_SHARE) {
587 				po_share >>= 1;
588 			}
589 		}
590 		mutex_exit(&pageout_mutex);
591 	}
592 
593 	/*
594 	 * Signal threads waiting for available memory.
595 	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
596 	 * in this case it is not needed - the waiters will be waken up during
597 	 * the next invocation of this function.
598 	 */
599 	if (kmem_avail() > 0)
600 		cv_broadcast(&memavail_cv);
601 
602 	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
603 }
604 
605 pgcnt_t		pushes;
606 ulong_t		push_list_size;		/* # of requests on pageout queue */
607 
608 #define	FRONT	1
609 #define	BACK	2
610 
611 int dopageout = 1;	/* must be non-zero to turn page stealing on */
612 
613 /*
614  * The page out daemon, which runs as process 2.
615  *
616  * As long as there are at least lotsfree pages,
617  * this process is not run.  When the number of free
618  * pages stays in the range desfree to lotsfree,
619  * this daemon runs through the pages in the loop
620  * at a rate determined in schedpaging().  Pageout manages
621  * two hands on the clock.  The front hand moves through
622  * memory, clearing the reference bit,
623  * and stealing pages from procs that are over maxrss.
624  * The back hand travels a distance behind the front hand,
625  * freeing the pages that have not been referenced in the time
626  * since the front hand passed.  If modified, they are pushed to
627  * swap before being freed.
628  *
629  * There are 2 threads that act on behalf of the pageout process.
630  * One thread scans pages (pageout_scanner) and frees them up if
631  * they don't require any VOP_PUTPAGE operation. If a page must be
632  * written back to its backing store, the request is put on a list
633  * and the other (pageout) thread is signaled. The pageout thread
634  * grabs VOP_PUTPAGE requests from the list, and processes them.
635  * Some filesystems may require resources for the VOP_PUTPAGE
636  * operations (like memory) and hence can block the pageout
637  * thread, but the scanner thread can still operate. There is still
638  * no gaurentee that memory deadlocks cannot occur.
639  *
640  * For now, this thing is in very rough form.
641  */
642 void
643 pageout()
644 {
645 	struct async_reqs *arg;
646 	pri_t pageout_pri;
647 	int i;
648 	pgcnt_t max_pushes;
649 	callb_cpr_t cprinfo;
650 
651 	proc_pageout = ttoproc(curthread);
652 	proc_pageout->p_cstime = 0;
653 	proc_pageout->p_stime =  0;
654 	proc_pageout->p_cutime =  0;
655 	proc_pageout->p_utime = 0;
656 	bcopy("pageout", u.u_psargs, 8);
657 	bcopy("pageout", u.u_comm, 7);
658 
659 	/*
660 	 * Create pageout scanner thread
661 	 */
662 	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
663 	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
664 
665 	/*
666 	 * Allocate and initialize the async request structures
667 	 * for pageout.
668 	 */
669 	push_req = (struct async_reqs *)
670 	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
671 
672 	req_freelist = push_req;
673 	for (i = 0; i < async_list_size - 1; i++)
674 		push_req[i].a_next = &push_req[i + 1];
675 
676 	pageout_pri = curthread->t_pri;
677 	pageout_init(pageout_scanner, proc_pageout, pageout_pri - 1);
678 
679 	/*
680 	 * kick off pageout scheduler.
681 	 */
682 	schedpaging(NULL);
683 
684 	/*
685 	 * Create kernel cage thread.
686 	 * The kernel cage thread is started under the pageout process
687 	 * to take advantage of the less restricted page allocation
688 	 * in page_create_throttle().
689 	 */
690 	kcage_cageout_init();
691 
692 	/*
693 	 * Limit pushes to avoid saturating pageout devices.
694 	 */
695 	max_pushes = maxpgio / RATETOSCHEDPAGING;
696 	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
697 
698 	for (;;) {
699 		mutex_enter(&push_lock);
700 
701 		while ((arg = push_list) == NULL || pushes > max_pushes) {
702 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
703 			cv_wait(&push_cv, &push_lock);
704 			pushes = 0;
705 			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
706 		}
707 		push_list = arg->a_next;
708 		arg->a_next = NULL;
709 		mutex_exit(&push_lock);
710 
711 		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
712 			arg->a_len, arg->a_flags,
713 			    arg->a_cred) == 0) {
714 			pushes++;
715 		}
716 
717 		/* vp held by checkpage() */
718 		VN_RELE(arg->a_vp);
719 
720 		mutex_enter(&push_lock);
721 		arg->a_next = req_freelist;	/* back on freelist */
722 		req_freelist = arg;
723 		push_list_size--;
724 		mutex_exit(&push_lock);
725 	}
726 }
727 
728 /*
729  * Kernel thread that scans pages looking for ones to free
730  */
731 static void
732 pageout_scanner(void)
733 {
734 	struct page *fronthand, *backhand;
735 	uint_t count;
736 	callb_cpr_t cprinfo;
737 	pgcnt_t	nscan_limit;
738 	pgcnt_t	pcount;
739 
740 	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
741 	mutex_enter(&pageout_mutex);
742 
743 	/*
744 	 * The restart case does not attempt to point the hands at roughly
745 	 * the right point on the assumption that after one circuit things
746 	 * will have settled down - and restarts shouldn't be that often.
747 	 */
748 
749 	/*
750 	 * Set the two clock hands to be separated by a reasonable amount,
751 	 * but no more than 360 degrees apart.
752 	 */
753 	backhand = page_first();
754 	if (handspreadpages >= total_pages)
755 		fronthand = page_nextn(backhand, total_pages - 1);
756 	else
757 		fronthand = page_nextn(backhand, handspreadpages);
758 
759 	min_pageout_ticks = MAX(1,
760 	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
761 	max_pageout_ticks = MAX(min_pageout_ticks,
762 	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
763 
764 loop:
765 	cv_signal_pageout();
766 
767 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
768 	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
769 	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
770 
771 	if (!dopageout)
772 		goto loop;
773 
774 	if (reset_hands) {
775 		reset_hands = 0;
776 
777 		backhand = page_first();
778 		if (handspreadpages >= total_pages)
779 			fronthand = page_nextn(backhand, total_pages - 1);
780 		else
781 			fronthand = page_nextn(backhand, handspreadpages);
782 	}
783 
784 	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
785 	count = 0;
786 
787 	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
788 		"pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
789 		freemem, lotsfree, nscan, desscan);
790 
791 	/* Kernel probe */
792 	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
793 		tnf_ulong, pages_free, freemem,
794 		tnf_ulong, pages_needed, needfree);
795 
796 	pcount = 0;
797 	if (pageout_sample_cnt < pageout_sample_lim) {
798 		nscan_limit = total_pages;
799 	} else {
800 		nscan_limit = desscan;
801 	}
802 	pageout_lbolt = lbolt;
803 	sample_start = gethrtime();
804 
805 	/*
806 	 * Scan the appropriate number of pages for a single duty cycle.
807 	 * However, stop scanning as soon as there is enough free memory.
808 	 * For a short while, we will be sampling the performance of the
809 	 * scanner and need to keep running just to get sample data, in
810 	 * which case we keep going and don't pay attention to whether
811 	 * or not there is enough free memory.
812 	 */
813 
814 	while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
815 	    pageout_sample_cnt < pageout_sample_lim)) {
816 		int rvfront, rvback;
817 
818 		/*
819 		 * Check to see if we have exceeded our %CPU budget
820 		 * for this wakeup, but not on every single page visited,
821 		 * just every once in a while.
822 		 */
823 		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
824 			pageout_cycle_ticks = lbolt - pageout_lbolt;
825 			if (pageout_cycle_ticks >= pageout_ticks) {
826 				++pageout_timeouts;
827 				break;
828 			}
829 		}
830 
831 		/*
832 		 * If checkpage manages to add a page to the free list,
833 		 * we give ourselves another couple of trips around the loop.
834 		 */
835 		if ((rvfront = checkpage(fronthand, FRONT)) == 1)
836 			count = 0;
837 		if ((rvback = checkpage(backhand, BACK)) == 1)
838 			count = 0;
839 
840 		++pcount;
841 
842 		/*
843 		 * protected by pageout_mutex instead of cpu_stat_lock
844 		 */
845 		CPU_STATS_ADDQ(CPU, vm, scan, 1);
846 
847 		/*
848 		 * Don't include ineligible pages in the number scanned.
849 		 */
850 		if (rvfront != -1 || rvback != -1)
851 			nscan++;
852 
853 		backhand = page_next(backhand);
854 
855 		/*
856 		 * backhand update and wraparound check are done separately
857 		 * because lint barks when it finds an empty "if" body
858 		 */
859 
860 		if ((fronthand = page_next(fronthand)) == page_first())	{
861 			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
862 				"pageout_hand_wrap:freemem %ld whichhand %d",
863 				freemem, FRONT);
864 
865 			/*
866 			 * protected by pageout_mutex instead of cpu_stat_lock
867 			 */
868 			CPU_STATS_ADDQ(CPU, vm, rev, 1);
869 			if (++count > 1) {
870 				/*
871 				 * Extremely unlikely, but it happens.
872 				 * We went around the loop at least once
873 				 * and didn't get far enough.
874 				 * If we are still skipping `highly shared'
875 				 * pages, skip fewer of them.  Otherwise,
876 				 * give up till the next clock tick.
877 				 */
878 				if (po_share < MAX_PO_SHARE) {
879 					po_share <<= 1;
880 				} else {
881 					/*
882 					 * Really a "goto loop", but
883 					 * if someone is TRACing or
884 					 * TNF_PROBE_ing, at least
885 					 * make records to show
886 					 * where we are.
887 					 */
888 					break;
889 				}
890 			}
891 		}
892 	}
893 
894 	sample_end = gethrtime();
895 
896 	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
897 		"pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
898 		freemem, lotsfree, nscan, desscan, count);
899 
900 	/* Kernel probe */
901 	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
902 		tnf_ulong, pages_scanned, nscan,
903 		tnf_ulong, pages_free, freemem);
904 
905 	if (pageout_sample_cnt < pageout_sample_lim) {
906 		pageout_sample_pages += pcount;
907 		pageout_sample_etime += sample_end - sample_start;
908 		++pageout_sample_cnt;
909 	}
910 	if (pageout_sample_cnt >= pageout_sample_lim &&
911 	    pageout_new_spread == 0) {
912 		pageout_rate = (hrrate_t)pageout_sample_pages *
913 		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
914 		pageout_new_spread = pageout_rate / 10;
915 		setupclock(1);
916 	}
917 
918 	goto loop;
919 }
920 
921 /*
922  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
923  * system (u., page table) or free, then leave it alone.  Otherwise,
924  * if we are running the front hand, turn off the page's reference bit.
925  * If the proc is over maxrss, we take it.  If running the back hand,
926  * check whether the page has been reclaimed.  If not, free the page,
927  * pushing it to disk first if necessary.
928  *
929  * Return values:
930  *	-1 if the page is not a candidate at all,
931  *	 0 if not freed, or
932  *	 1 if we freed it.
933  */
934 static int
935 checkpage(struct page *pp, int whichhand)
936 {
937 	int ppattr;
938 	int isfs = 0;
939 	int isexec = 0;
940 	int pagesync_flag;
941 
942 	/*
943 	 * Skip pages:
944 	 * 	- associated with the kernel vnode since
945 	 *	    they are always "exclusively" locked.
946 	 *	- that are free
947 	 *	- that are shared more than po_share'd times
948 	 *	- its already locked
949 	 *
950 	 * NOTE:  These optimizations assume that reads are atomic.
951 	 */
952 top:
953 	if ((pp->p_vnode == &kvp) ||
954 	    (PP_ISFREE(pp)) ||
955 	    (hat_page_getshare(pp) > po_share) || PAGE_LOCKED(pp)) {
956 		return (-1);
957 	}
958 
959 	if (!page_trylock(pp, SE_EXCL)) {
960 		/*
961 		 * Skip the page if we can't acquire the "exclusive" lock.
962 		 */
963 		return (-1);
964 	} else if (PP_ISFREE(pp)) {
965 		/*
966 		 * It became free between the above check and our actually
967 		 * locking the page.  Oh, well there will be other pages.
968 		 */
969 		page_unlock(pp);
970 		return (-1);
971 	}
972 
973 	/*
974 	 * Reject pages that cannot be freed. The page_struct_lock
975 	 * need not be acquired to examine these
976 	 * fields since the page has an "exclusive" lock.
977 	 */
978 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
979 		page_unlock(pp);
980 		return (-1);
981 	}
982 
983 	/*
984 	 * Maintain statistics for what we are freeing
985 	 */
986 
987 	if (pp->p_vnode != NULL) {
988 		if (pp->p_vnode->v_flag & VVMEXEC)
989 			isexec = 1;
990 
991 		if (!IS_SWAPFSVP(pp->p_vnode))
992 			isfs = 1;
993 	}
994 
995 	/*
996 	 * Turn off REF and MOD bits with the front hand.
997 	 * The back hand examines the REF bit and always considers
998 	 * SHARED pages as referenced.
999 	 */
1000 	if (whichhand == FRONT)
1001 		pagesync_flag = HAT_SYNC_ZERORM;
1002 	else
1003 		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1004 		    HAT_SYNC_STOPON_SHARED;
1005 
1006 	ppattr = hat_pagesync(pp, pagesync_flag);
1007 
1008 recheck:
1009 	/*
1010 	 * If page is referenced; make unreferenced but reclaimable.
1011 	 * If this page is not referenced, then it must be reclaimable
1012 	 * and we can add it to the free list.
1013 	 */
1014 	if (ppattr & P_REF) {
1015 		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1016 		    "pageout_isref:pp %p whichhand %d", pp, whichhand);
1017 		if (whichhand == FRONT) {
1018 			/*
1019 			 * Checking of rss or madvise flags needed here...
1020 			 *
1021 			 * If not "well-behaved", fall through into the code
1022 			 * for not referenced.
1023 			 */
1024 			hat_clrref(pp);
1025 		}
1026 		/*
1027 		 * Somebody referenced the page since the front
1028 		 * hand went by, so it's not a candidate for
1029 		 * freeing up.
1030 		 */
1031 		page_unlock(pp);
1032 		return (0);
1033 	}
1034 
1035 	VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1036 
1037 	/*
1038 	 * If large page, attempt to demote it. If successfully demoted,
1039 	 * retry the checkpage.
1040 	 */
1041 	if (pp->p_szc != 0) {
1042 		if (!page_try_demote_pages(pp)) {
1043 			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1044 			page_unlock(pp);
1045 			return (-1);
1046 		}
1047 		ASSERT(pp->p_szc == 0);
1048 		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1049 		/*
1050 		 * since page_try_demote_pages() could have unloaded some
1051 		 * mappings it makes sense to reload ppattr.
1052 		 */
1053 		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1054 	}
1055 
1056 	/*
1057 	 * If the page is currently dirty, we have to arrange
1058 	 * to have it cleaned before it can be freed.
1059 	 *
1060 	 * XXX - ASSERT(pp->p_vnode != NULL);
1061 	 */
1062 	if ((ppattr & P_MOD) && pp->p_vnode) {
1063 		struct vnode *vp = pp->p_vnode;
1064 		u_offset_t offset = pp->p_offset;
1065 
1066 		/*
1067 		 * XXX - Test for process being swapped out or about to exit?
1068 		 * [Can't get back to process(es) using the page.]
1069 		 */
1070 
1071 		/*
1072 		 * Hold the vnode before releasing the page lock to
1073 		 * prevent it from being freed and re-used by some
1074 		 * other thread.
1075 		 */
1076 		VN_HOLD(vp);
1077 		page_unlock(pp);
1078 
1079 		/*
1080 		 * Queue i/o request for the pageout thread.
1081 		 */
1082 		if (!queue_io_request(vp, offset)) {
1083 			VN_RELE(vp);
1084 			return (0);
1085 		}
1086 		return (1);
1087 	}
1088 
1089 	/*
1090 	 * Now we unload all the translations,
1091 	 * and put the page back on to the free list.
1092 	 * If the page was used (referenced or modified) after
1093 	 * the pagesync but before it was unloaded we catch it
1094 	 * and handle the page properly.
1095 	 */
1096 	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1097 		"pageout_free:pp %p whichhand %d", pp, whichhand);
1098 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1099 	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1100 	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1101 		goto recheck;
1102 
1103 	/*LINTED: constant in conditional context*/
1104 	VN_DISPOSE(pp, B_FREE, 0, kcred);
1105 
1106 	CPU_STATS_ADD_K(vm, dfree, 1);
1107 
1108 	if (isfs) {
1109 		if (isexec) {
1110 			CPU_STATS_ADD_K(vm, execfree, 1);
1111 		} else {
1112 			CPU_STATS_ADD_K(vm, fsfree, 1);
1113 		}
1114 	} else {
1115 		CPU_STATS_ADD_K(vm, anonfree, 1);
1116 	}
1117 
1118 	return (1);		/* freed a page! */
1119 }
1120 
1121 /*
1122  * Queue async i/o request from pageout_scanner and segment swapout
1123  * routines on one common list.  This ensures that pageout devices (swap)
1124  * are not saturated by pageout_scanner or swapout requests.
1125  * The pageout thread empties this list by initiating i/o operations.
1126  */
1127 int
1128 queue_io_request(vnode_t *vp, u_offset_t off)
1129 {
1130 	struct async_reqs *arg;
1131 
1132 	/*
1133 	 * If we cannot allocate an async request struct,
1134 	 * skip this page.
1135 	 */
1136 	mutex_enter(&push_lock);
1137 	if ((arg = req_freelist) == NULL) {
1138 		mutex_exit(&push_lock);
1139 		return (0);
1140 	}
1141 	req_freelist = arg->a_next;		/* adjust freelist */
1142 	push_list_size++;
1143 
1144 	arg->a_vp = vp;
1145 	arg->a_off = off;
1146 	arg->a_len = PAGESIZE;
1147 	arg->a_flags = B_ASYNC | B_FREE;
1148 	arg->a_cred = kcred;		/* always held */
1149 
1150 	/*
1151 	 * Add to list of pending write requests.
1152 	 */
1153 	arg->a_next = push_list;
1154 	push_list = arg;
1155 
1156 	if (req_freelist == NULL) {
1157 		/*
1158 		 * No free async requests left. The lock is held so we
1159 		 * might as well signal the pusher thread now.
1160 		 */
1161 		cv_signal(&push_cv);
1162 	}
1163 	mutex_exit(&push_lock);
1164 	return (1);
1165 }
1166 
1167 /*
1168  * Wakeup pageout to initiate i/o if push_list is not empty.
1169  */
1170 void
1171 cv_signal_pageout()
1172 {
1173 	if (push_list != NULL) {
1174 		mutex_enter(&push_lock);
1175 		cv_signal(&push_cv);
1176 		mutex_exit(&push_lock);
1177 	}
1178 }
1179