17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate * CDDL HEADER START
37c478bd9Sstevel@tonic-gate *
47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5ad23a2dbSjohansen * Common Development and Distribution License (the "License").
6ad23a2dbSjohansen * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate *
87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate * and limitations under the License.
127c478bd9Sstevel@tonic-gate *
137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate *
197c478bd9Sstevel@tonic-gate * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate /*
22d3d50737SRafael Vanoni * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
237c478bd9Sstevel@tonic-gate * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate */
257c478bd9Sstevel@tonic-gate
267c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
277c478bd9Sstevel@tonic-gate /* All Rights Reserved */
287c478bd9Sstevel@tonic-gate
297c478bd9Sstevel@tonic-gate /*
307c478bd9Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988
317c478bd9Sstevel@tonic-gate * The Regents of the University of California
327c478bd9Sstevel@tonic-gate * All Rights Reserved
337c478bd9Sstevel@tonic-gate *
347c478bd9Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from
357c478bd9Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its
367c478bd9Sstevel@tonic-gate * contributors.
377c478bd9Sstevel@tonic-gate */
387c478bd9Sstevel@tonic-gate
397c478bd9Sstevel@tonic-gate #include <sys/types.h>
407c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
417c478bd9Sstevel@tonic-gate #include <sys/param.h>
427c478bd9Sstevel@tonic-gate #include <sys/buf.h>
437c478bd9Sstevel@tonic-gate #include <sys/uio.h>
447c478bd9Sstevel@tonic-gate #include <sys/proc.h>
457c478bd9Sstevel@tonic-gate #include <sys/systm.h>
467c478bd9Sstevel@tonic-gate #include <sys/mman.h>
477c478bd9Sstevel@tonic-gate #include <sys/cred.h>
487c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
497c478bd9Sstevel@tonic-gate #include <sys/vm.h>
507c478bd9Sstevel@tonic-gate #include <sys/vmparam.h>
517c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
527c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
537c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
547c478bd9Sstevel@tonic-gate #include <sys/user.h>
557c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
567c478bd9Sstevel@tonic-gate #include <sys/debug.h>
577c478bd9Sstevel@tonic-gate #include <sys/callb.h>
587c478bd9Sstevel@tonic-gate #include <sys/tnf_probe.h>
597c478bd9Sstevel@tonic-gate #include <sys/mem_cage.h>
607c478bd9Sstevel@tonic-gate #include <sys/time.h>
617c478bd9Sstevel@tonic-gate
627c478bd9Sstevel@tonic-gate #include <vm/hat.h>
637c478bd9Sstevel@tonic-gate #include <vm/as.h>
647c478bd9Sstevel@tonic-gate #include <vm/seg.h>
657c478bd9Sstevel@tonic-gate #include <vm/page.h>
667c478bd9Sstevel@tonic-gate #include <vm/pvn.h>
677c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
687c478bd9Sstevel@tonic-gate
697c478bd9Sstevel@tonic-gate static int checkpage(page_t *, int);
707c478bd9Sstevel@tonic-gate
717c478bd9Sstevel@tonic-gate /*
727c478bd9Sstevel@tonic-gate * The following parameters control operation of the page replacement
737c478bd9Sstevel@tonic-gate * algorithm. They are initialized to 0, and then computed at boot time
747c478bd9Sstevel@tonic-gate * based on the size of the system. If they are patched non-zero in
757c478bd9Sstevel@tonic-gate * a loaded vmunix they are left alone and may thus be changed per system
767c478bd9Sstevel@tonic-gate * using adb on the loaded system.
777c478bd9Sstevel@tonic-gate */
787c478bd9Sstevel@tonic-gate pgcnt_t slowscan = 0;
797c478bd9Sstevel@tonic-gate pgcnt_t fastscan = 0;
807c478bd9Sstevel@tonic-gate
817c478bd9Sstevel@tonic-gate static pgcnt_t handspreadpages = 0;
827c478bd9Sstevel@tonic-gate static int loopfraction = 2;
837c478bd9Sstevel@tonic-gate static pgcnt_t looppages;
847c478bd9Sstevel@tonic-gate static int min_percent_cpu = 4;
857c478bd9Sstevel@tonic-gate static int max_percent_cpu = 80;
867c478bd9Sstevel@tonic-gate static pgcnt_t maxfastscan = 0;
877c478bd9Sstevel@tonic-gate static pgcnt_t maxslowscan = 100;
887c478bd9Sstevel@tonic-gate
897c478bd9Sstevel@tonic-gate pgcnt_t maxpgio = 0;
907c478bd9Sstevel@tonic-gate pgcnt_t minfree = 0;
917c478bd9Sstevel@tonic-gate pgcnt_t desfree = 0;
927c478bd9Sstevel@tonic-gate pgcnt_t lotsfree = 0;
937c478bd9Sstevel@tonic-gate pgcnt_t needfree = 0;
947c478bd9Sstevel@tonic-gate pgcnt_t throttlefree = 0;
957c478bd9Sstevel@tonic-gate pgcnt_t pageout_reserve = 0;
967c478bd9Sstevel@tonic-gate
977c478bd9Sstevel@tonic-gate pgcnt_t deficit;
987c478bd9Sstevel@tonic-gate pgcnt_t nscan;
997c478bd9Sstevel@tonic-gate pgcnt_t desscan;
1007c478bd9Sstevel@tonic-gate
/*
 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 * are the number of ticks in each wakeup cycle that gives the
 * equivalent of some underlying %CPU duty cycle.
 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 * So, for example, 4% == 1 tick and 80% == 20 ticks.
 *
 * min_pageout_ticks:
 *	ticks/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_ticks:
 *	ticks/wakeup equivalent of max_percent_cpu.
 *
 * pageout_ticks:
 *	Number of clock ticks budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_ticks .. max_pageout_ticks,
 *	depending on memory pressure.
 *
 * pageout_lbolt:
 *	Timestamp of the last time pageout_scanner woke up and started
 *	(or resumed) scanning for not recently referenced pages.
 */
1267c478bd9Sstevel@tonic-gate
1277c478bd9Sstevel@tonic-gate static clock_t min_pageout_ticks;
1287c478bd9Sstevel@tonic-gate static clock_t max_pageout_ticks;
1297c478bd9Sstevel@tonic-gate static clock_t pageout_ticks;
1307c478bd9Sstevel@tonic-gate static clock_t pageout_lbolt;
1317c478bd9Sstevel@tonic-gate
1327c478bd9Sstevel@tonic-gate static uint_t reset_hands;
1337c478bd9Sstevel@tonic-gate
1347c478bd9Sstevel@tonic-gate #define PAGES_POLL_MASK 1023
1357c478bd9Sstevel@tonic-gate
1367c478bd9Sstevel@tonic-gate /*
1377c478bd9Sstevel@tonic-gate * pageout_sample_lim:
1387c478bd9Sstevel@tonic-gate * The limit on the number of samples needed to establish a value
1397c478bd9Sstevel@tonic-gate * for new pageout parameters, fastscan, slowscan, and handspreadpages.
1407c478bd9Sstevel@tonic-gate *
1417c478bd9Sstevel@tonic-gate * pageout_sample_cnt:
1427c478bd9Sstevel@tonic-gate * Current sample number. Once the sample gets large enough,
1437c478bd9Sstevel@tonic-gate * set new values for handspreadpages, fastscan and slowscan.
1447c478bd9Sstevel@tonic-gate *
1457c478bd9Sstevel@tonic-gate * pageout_sample_pages:
1467c478bd9Sstevel@tonic-gate * The accumulated number of pages scanned during sampling.
1477c478bd9Sstevel@tonic-gate *
1487c478bd9Sstevel@tonic-gate * pageout_sample_ticks:
1497c478bd9Sstevel@tonic-gate * The accumulated clock ticks for the sample.
1507c478bd9Sstevel@tonic-gate *
1517c478bd9Sstevel@tonic-gate * pageout_rate:
1527c478bd9Sstevel@tonic-gate * Rate in pages/nanosecond, computed at the end of sampling.
1537c478bd9Sstevel@tonic-gate *
1547c478bd9Sstevel@tonic-gate * pageout_new_spread:
1557c478bd9Sstevel@tonic-gate * The new value to use for fastscan and handspreadpages.
1567c478bd9Sstevel@tonic-gate * Calculated after enough samples have been taken.
1577c478bd9Sstevel@tonic-gate */
1587c478bd9Sstevel@tonic-gate
1597c478bd9Sstevel@tonic-gate typedef hrtime_t hrrate_t;
1607c478bd9Sstevel@tonic-gate
1617c478bd9Sstevel@tonic-gate static uint64_t pageout_sample_lim = 4;
1627c478bd9Sstevel@tonic-gate static uint64_t pageout_sample_cnt = 0;
1637c478bd9Sstevel@tonic-gate static pgcnt_t pageout_sample_pages = 0;
1647c478bd9Sstevel@tonic-gate static hrrate_t pageout_rate = 0;
1657c478bd9Sstevel@tonic-gate static pgcnt_t pageout_new_spread = 0;
1667c478bd9Sstevel@tonic-gate
1677c478bd9Sstevel@tonic-gate static clock_t pageout_cycle_ticks;
1687c478bd9Sstevel@tonic-gate static hrtime_t sample_start, sample_end;
1697c478bd9Sstevel@tonic-gate static hrtime_t pageout_sample_etime = 0;
1707c478bd9Sstevel@tonic-gate
1717c478bd9Sstevel@tonic-gate /*
1727c478bd9Sstevel@tonic-gate * Record number of times a pageout_scanner wakeup cycle finished because it
1737c478bd9Sstevel@tonic-gate * timed out (exceeded its CPU budget), rather than because it visited
1747c478bd9Sstevel@tonic-gate * its budgeted number of pages.
1757c478bd9Sstevel@tonic-gate */
1767c478bd9Sstevel@tonic-gate uint64_t pageout_timeouts = 0;
1777c478bd9Sstevel@tonic-gate
1787c478bd9Sstevel@tonic-gate #ifdef VM_STATS
1797c478bd9Sstevel@tonic-gate static struct pageoutvmstats_str {
1807c478bd9Sstevel@tonic-gate ulong_t checkpage[3];
1817c478bd9Sstevel@tonic-gate } pageoutvmstats;
1827c478bd9Sstevel@tonic-gate #endif /* VM_STATS */
1837c478bd9Sstevel@tonic-gate
1847c478bd9Sstevel@tonic-gate /*
1857c478bd9Sstevel@tonic-gate * Threads waiting for free memory use this condition variable and lock until
1867c478bd9Sstevel@tonic-gate * memory becomes available.
1877c478bd9Sstevel@tonic-gate */
1887c478bd9Sstevel@tonic-gate kmutex_t memavail_lock;
1897c478bd9Sstevel@tonic-gate kcondvar_t memavail_cv;
1907c478bd9Sstevel@tonic-gate
1917c478bd9Sstevel@tonic-gate /*
1927c478bd9Sstevel@tonic-gate * The size of the clock loop.
1937c478bd9Sstevel@tonic-gate */
1947c478bd9Sstevel@tonic-gate #define LOOPPAGES total_pages
1957c478bd9Sstevel@tonic-gate
1967c478bd9Sstevel@tonic-gate /*
1977c478bd9Sstevel@tonic-gate * Set up the paging constants for the clock algorithm.
1987c478bd9Sstevel@tonic-gate * Called after the system is initialized and the amount of memory
1997c478bd9Sstevel@tonic-gate * and number of paging devices is known.
2007c478bd9Sstevel@tonic-gate *
2017c478bd9Sstevel@tonic-gate * lotsfree is 1/64 of memory, but at least 512K.
2027c478bd9Sstevel@tonic-gate * desfree is 1/2 of lotsfree.
2037c478bd9Sstevel@tonic-gate * minfree is 1/2 of desfree.
2047c478bd9Sstevel@tonic-gate *
2057c478bd9Sstevel@tonic-gate * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
2067c478bd9Sstevel@tonic-gate *
2077c478bd9Sstevel@tonic-gate * lotsfree = btop(512K)
2087c478bd9Sstevel@tonic-gate * desfree = btop(200K)
2097c478bd9Sstevel@tonic-gate * minfree = btop(100K)
2107c478bd9Sstevel@tonic-gate * throttlefree = INT_MIN
2117c478bd9Sstevel@tonic-gate * max_percent_cpu = 4
2127c478bd9Sstevel@tonic-gate */
2137c478bd9Sstevel@tonic-gate void
setupclock(int recalc)2147c478bd9Sstevel@tonic-gate setupclock(int recalc)
2157c478bd9Sstevel@tonic-gate {
2167c478bd9Sstevel@tonic-gate
2177c478bd9Sstevel@tonic-gate static spgcnt_t init_lfree, init_dfree, init_mfree;
2187c478bd9Sstevel@tonic-gate static spgcnt_t init_tfree, init_preserve, init_mpgio;
2197c478bd9Sstevel@tonic-gate static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
2207c478bd9Sstevel@tonic-gate
2217c478bd9Sstevel@tonic-gate looppages = LOOPPAGES;
2227c478bd9Sstevel@tonic-gate
2237c478bd9Sstevel@tonic-gate /*
2247c478bd9Sstevel@tonic-gate * setupclock can now be called to recalculate the paging
2257c478bd9Sstevel@tonic-gate * parameters in the case of dynamic addition of memory.
2267c478bd9Sstevel@tonic-gate * So to make sure we make the proper calculations, if such a
2277c478bd9Sstevel@tonic-gate * situation should arise, we save away the initial values
2287c478bd9Sstevel@tonic-gate * of each parameter so we can recall them when needed. This
2297c478bd9Sstevel@tonic-gate * way we don't lose the settings an admin might have made
2307c478bd9Sstevel@tonic-gate * through the /etc/system file.
2317c478bd9Sstevel@tonic-gate */
2327c478bd9Sstevel@tonic-gate
2337c478bd9Sstevel@tonic-gate if (!recalc) {
2347c478bd9Sstevel@tonic-gate init_lfree = lotsfree;
2357c478bd9Sstevel@tonic-gate init_dfree = desfree;
2367c478bd9Sstevel@tonic-gate init_mfree = minfree;
2377c478bd9Sstevel@tonic-gate init_tfree = throttlefree;
2387c478bd9Sstevel@tonic-gate init_preserve = pageout_reserve;
2397c478bd9Sstevel@tonic-gate init_mpgio = maxpgio;
2407c478bd9Sstevel@tonic-gate init_mfscan = maxfastscan;
2417c478bd9Sstevel@tonic-gate init_fscan = fastscan;
2427c478bd9Sstevel@tonic-gate init_sscan = slowscan;
2437c478bd9Sstevel@tonic-gate init_hspages = handspreadpages;
2447c478bd9Sstevel@tonic-gate }
2457c478bd9Sstevel@tonic-gate
2467c478bd9Sstevel@tonic-gate /*
2477c478bd9Sstevel@tonic-gate * Set up thresholds for paging:
2487c478bd9Sstevel@tonic-gate */
2497c478bd9Sstevel@tonic-gate
2507c478bd9Sstevel@tonic-gate /*
2517c478bd9Sstevel@tonic-gate * Lotsfree is threshold where paging daemon turns on.
2527c478bd9Sstevel@tonic-gate */
2537c478bd9Sstevel@tonic-gate if (init_lfree == 0 || init_lfree >= looppages)
2547c478bd9Sstevel@tonic-gate lotsfree = MAX(looppages / 64, btop(512 * 1024));
2557c478bd9Sstevel@tonic-gate else
2567c478bd9Sstevel@tonic-gate lotsfree = init_lfree;
2577c478bd9Sstevel@tonic-gate
2587c478bd9Sstevel@tonic-gate /*
2597c478bd9Sstevel@tonic-gate * Desfree is amount of memory desired free.
2607c478bd9Sstevel@tonic-gate * If less than this for extended period, start swapping.
2617c478bd9Sstevel@tonic-gate */
2627c478bd9Sstevel@tonic-gate if (init_dfree == 0 || init_dfree >= lotsfree)
2637c478bd9Sstevel@tonic-gate desfree = lotsfree / 2;
2647c478bd9Sstevel@tonic-gate else
2657c478bd9Sstevel@tonic-gate desfree = init_dfree;
2667c478bd9Sstevel@tonic-gate
2677c478bd9Sstevel@tonic-gate /*
2687c478bd9Sstevel@tonic-gate * Minfree is minimal amount of free memory which is tolerable.
2697c478bd9Sstevel@tonic-gate */
2707c478bd9Sstevel@tonic-gate if (init_mfree == 0 || init_mfree >= desfree)
2717c478bd9Sstevel@tonic-gate minfree = desfree / 2;
2727c478bd9Sstevel@tonic-gate else
2737c478bd9Sstevel@tonic-gate minfree = init_mfree;
2747c478bd9Sstevel@tonic-gate
2757c478bd9Sstevel@tonic-gate /*
2767c478bd9Sstevel@tonic-gate * Throttlefree is the point at which we start throttling
2777c478bd9Sstevel@tonic-gate * PG_WAIT requests until enough memory becomes available.
2787c478bd9Sstevel@tonic-gate */
2797c478bd9Sstevel@tonic-gate if (init_tfree == 0 || init_tfree >= desfree)
2807c478bd9Sstevel@tonic-gate throttlefree = minfree;
2817c478bd9Sstevel@tonic-gate else
2827c478bd9Sstevel@tonic-gate throttlefree = init_tfree;
2837c478bd9Sstevel@tonic-gate
2847c478bd9Sstevel@tonic-gate /*
2857c478bd9Sstevel@tonic-gate * Pageout_reserve is the number of pages that we keep in
2867c478bd9Sstevel@tonic-gate * stock for pageout's own use. Having a few such pages
2877c478bd9Sstevel@tonic-gate * provides insurance against system deadlock due to
2887c478bd9Sstevel@tonic-gate * pageout needing pages. When freemem < pageout_reserve,
2897c478bd9Sstevel@tonic-gate * non-blocking allocations are denied to any threads
2907c478bd9Sstevel@tonic-gate * other than pageout and sched. (At some point we might
2917c478bd9Sstevel@tonic-gate * want to consider a per-thread flag like T_PUSHING_PAGES
2927c478bd9Sstevel@tonic-gate * to indicate that a thread is part of the page-pushing
2937c478bd9Sstevel@tonic-gate * dance (e.g. an interrupt thread) and thus is entitled
2947c478bd9Sstevel@tonic-gate * to the same special dispensation we accord pageout.)
2957c478bd9Sstevel@tonic-gate */
2967c478bd9Sstevel@tonic-gate if (init_preserve == 0 || init_preserve >= throttlefree)
2977c478bd9Sstevel@tonic-gate pageout_reserve = throttlefree / 2;
2987c478bd9Sstevel@tonic-gate else
2997c478bd9Sstevel@tonic-gate pageout_reserve = init_preserve;
3007c478bd9Sstevel@tonic-gate
3017c478bd9Sstevel@tonic-gate /*
3027c478bd9Sstevel@tonic-gate * Maxpgio thresholds how much paging is acceptable.
3037c478bd9Sstevel@tonic-gate * This figures that 2/3 busy on an arm is all that is
3047c478bd9Sstevel@tonic-gate * tolerable for paging. We assume one operation per disk rev.
3057c478bd9Sstevel@tonic-gate *
3067c478bd9Sstevel@tonic-gate * XXX - Does not account for multiple swap devices.
3077c478bd9Sstevel@tonic-gate */
3087c478bd9Sstevel@tonic-gate if (init_mpgio == 0)
3097c478bd9Sstevel@tonic-gate maxpgio = (DISKRPM * 2) / 3;
3107c478bd9Sstevel@tonic-gate else
3117c478bd9Sstevel@tonic-gate maxpgio = init_mpgio;
3127c478bd9Sstevel@tonic-gate
3137c478bd9Sstevel@tonic-gate /*
3147c478bd9Sstevel@tonic-gate * The clock scan rate varies between fastscan and slowscan
3157c478bd9Sstevel@tonic-gate * based on the amount of free memory available. Fastscan
3167c478bd9Sstevel@tonic-gate * rate should be set based on the number pages that can be
3177c478bd9Sstevel@tonic-gate * scanned per sec using ~10% of processor time. Since this
3187c478bd9Sstevel@tonic-gate * value depends on the processor, MMU, Mhz etc., it is
3197c478bd9Sstevel@tonic-gate * difficult to determine it in a generic manner for all
3207c478bd9Sstevel@tonic-gate * architectures.
3217c478bd9Sstevel@tonic-gate *
3227c478bd9Sstevel@tonic-gate * Instead of trying to determine the number of pages scanned
3237c478bd9Sstevel@tonic-gate * per sec for every processor, fastscan is set to be the smaller
3247c478bd9Sstevel@tonic-gate * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
3257c478bd9Sstevel@tonic-gate * time is limited to ~4% of processor time.
3267c478bd9Sstevel@tonic-gate *
3277c478bd9Sstevel@tonic-gate * Setting fastscan to be 1/2 of memory allows pageout to scan
3287c478bd9Sstevel@tonic-gate * all of memory in ~2 secs. This implies that user pages not
3297c478bd9Sstevel@tonic-gate * accessed within 1 sec (assuming, handspreadpages == fastscan)
3307c478bd9Sstevel@tonic-gate * can be reclaimed when free memory is very low. Stealing pages
3317c478bd9Sstevel@tonic-gate * not accessed within 1 sec seems reasonable and ensures that
3327c478bd9Sstevel@tonic-gate * active user processes don't thrash.
3337c478bd9Sstevel@tonic-gate *
3347c478bd9Sstevel@tonic-gate * Smaller values of fastscan result in scanning fewer pages
3357c478bd9Sstevel@tonic-gate * every second and consequently pageout may not be able to free
3367c478bd9Sstevel@tonic-gate * sufficient memory to maintain the minimum threshold. Larger
3377c478bd9Sstevel@tonic-gate * values of fastscan result in scanning a lot more pages which
3387c478bd9Sstevel@tonic-gate * could lead to thrashing and higher CPU usage.
3397c478bd9Sstevel@tonic-gate *
3407c478bd9Sstevel@tonic-gate * Fastscan needs to be limited to a maximum value and should not
3417c478bd9Sstevel@tonic-gate * scale with memory to prevent pageout from consuming too much
3427c478bd9Sstevel@tonic-gate * time for scanning on slow CPU's and avoid thrashing, as a
3437c478bd9Sstevel@tonic-gate * result of scanning too many pages, on faster CPU's.
3447c478bd9Sstevel@tonic-gate * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
3457c478bd9Sstevel@tonic-gate * (the upper bound for fastscan) based on the average number
3467c478bd9Sstevel@tonic-gate * of pages that can potentially be scanned in ~1 sec (using ~4%
3477c478bd9Sstevel@tonic-gate * of the CPU) on some of the following machines that currently
3487c478bd9Sstevel@tonic-gate * run Solaris 2.x:
3497c478bd9Sstevel@tonic-gate *
3507c478bd9Sstevel@tonic-gate * average memory scanned in ~1 sec
3517c478bd9Sstevel@tonic-gate *
3527c478bd9Sstevel@tonic-gate * 25 Mhz SS1+: 23 Meg
3537c478bd9Sstevel@tonic-gate * LX: 37 Meg
3547c478bd9Sstevel@tonic-gate * 50 Mhz SC2000: 68 Meg
3557c478bd9Sstevel@tonic-gate *
3567c478bd9Sstevel@tonic-gate * 40 Mhz 486: 26 Meg
3577c478bd9Sstevel@tonic-gate * 66 Mhz 486: 42 Meg
3587c478bd9Sstevel@tonic-gate *
3597c478bd9Sstevel@tonic-gate * When free memory falls just below lotsfree, the scan rate
3607c478bd9Sstevel@tonic-gate * goes from 0 to slowscan (i.e., pageout starts running). This
3617c478bd9Sstevel@tonic-gate * transition needs to be smooth and is achieved by ensuring that
3627c478bd9Sstevel@tonic-gate * pageout scans a small number of pages to satisfy the transient
3637c478bd9Sstevel@tonic-gate * memory demand. This is set to not exceed 100 pages/sec (25 per
3647c478bd9Sstevel@tonic-gate * wakeup) since scanning that many pages has no noticible impact
3657c478bd9Sstevel@tonic-gate * on system performance.
3667c478bd9Sstevel@tonic-gate *
3677c478bd9Sstevel@tonic-gate * In addition to setting fastscan and slowscan, pageout is
3687c478bd9Sstevel@tonic-gate * limited to using ~4% of the CPU. This results in increasing
3697c478bd9Sstevel@tonic-gate * the time taken to scan all of memory, which in turn means that
3707c478bd9Sstevel@tonic-gate * user processes have a better opportunity of preventing their
3717c478bd9Sstevel@tonic-gate * pages from being stolen. This has a positive effect on
3727c478bd9Sstevel@tonic-gate * interactive and overall system performance when memory demand
3737c478bd9Sstevel@tonic-gate * is high.
3747c478bd9Sstevel@tonic-gate *
3757c478bd9Sstevel@tonic-gate * Thus, the rate at which pages are scanned for replacement will
3767c478bd9Sstevel@tonic-gate * vary linearly between slowscan and the number of pages that
3777c478bd9Sstevel@tonic-gate * can be scanned using ~4% of processor time instead of varying
3787c478bd9Sstevel@tonic-gate * linearly between slowscan and fastscan.
3797c478bd9Sstevel@tonic-gate *
3807c478bd9Sstevel@tonic-gate * Also, the processor time used by pageout will vary from ~1%
3817c478bd9Sstevel@tonic-gate * at slowscan to ~4% at fastscan instead of varying between
3827c478bd9Sstevel@tonic-gate * ~1% at slowscan and ~10% at fastscan.
3837c478bd9Sstevel@tonic-gate *
3847c478bd9Sstevel@tonic-gate * The values chosen for the various VM parameters (fastscan,
3857c478bd9Sstevel@tonic-gate * handspreadpages, etc) are not universally true for all machines,
3867c478bd9Sstevel@tonic-gate * but appear to be a good rule of thumb for the machines we've
3877c478bd9Sstevel@tonic-gate * tested. They have the following ranges:
3887c478bd9Sstevel@tonic-gate *
3897c478bd9Sstevel@tonic-gate * cpu speed: 20 to 70 Mhz
3907c478bd9Sstevel@tonic-gate * page size: 4K to 8K
3917c478bd9Sstevel@tonic-gate * memory size: 16M to 5G
3927c478bd9Sstevel@tonic-gate * page scan rate: 4000 - 17400 4K pages per sec
3937c478bd9Sstevel@tonic-gate *
3947c478bd9Sstevel@tonic-gate * The values need to be re-examined for machines which don't
3957c478bd9Sstevel@tonic-gate * fall into the various ranges (e.g., slower or faster CPUs,
3967c478bd9Sstevel@tonic-gate * smaller or larger pagesizes etc) shown above.
3977c478bd9Sstevel@tonic-gate *
3987c478bd9Sstevel@tonic-gate * On an MP machine, pageout is often unable to maintain the
3997c478bd9Sstevel@tonic-gate * minimum paging thresholds under heavy load. This is due to
4007c478bd9Sstevel@tonic-gate * the fact that user processes running on other CPU's can be
4017c478bd9Sstevel@tonic-gate * dirtying memory at a much faster pace than pageout can find
4027c478bd9Sstevel@tonic-gate * pages to free. The memory demands could be met by enabling
4037c478bd9Sstevel@tonic-gate * more than one CPU to run the clock algorithm in such a manner
4047c478bd9Sstevel@tonic-gate * that the various clock hands don't overlap. This also makes
4057c478bd9Sstevel@tonic-gate * it more difficult to determine the values for fastscan, slowscan
4067c478bd9Sstevel@tonic-gate * and handspreadpages.
4077c478bd9Sstevel@tonic-gate *
4087c478bd9Sstevel@tonic-gate * The swapper is currently used to free up memory when pageout
4097c478bd9Sstevel@tonic-gate * is unable to meet memory demands by swapping out processes.
4107c478bd9Sstevel@tonic-gate * In addition to freeing up memory, swapping also reduces the
4117c478bd9Sstevel@tonic-gate * demand for memory by preventing user processes from running
4127c478bd9Sstevel@tonic-gate * and thereby consuming memory.
4137c478bd9Sstevel@tonic-gate */
4147c478bd9Sstevel@tonic-gate if (init_mfscan == 0) {
4157c478bd9Sstevel@tonic-gate if (pageout_new_spread != 0)
4167c478bd9Sstevel@tonic-gate maxfastscan = pageout_new_spread;
4177c478bd9Sstevel@tonic-gate else
4187c478bd9Sstevel@tonic-gate maxfastscan = MAXHANDSPREADPAGES;
4197c478bd9Sstevel@tonic-gate } else {
4207c478bd9Sstevel@tonic-gate maxfastscan = init_mfscan;
4217c478bd9Sstevel@tonic-gate }
4227c478bd9Sstevel@tonic-gate if (init_fscan == 0)
4237c478bd9Sstevel@tonic-gate fastscan = MIN(looppages / loopfraction, maxfastscan);
4247c478bd9Sstevel@tonic-gate else
4257c478bd9Sstevel@tonic-gate fastscan = init_fscan;
4267c478bd9Sstevel@tonic-gate if (fastscan > looppages / loopfraction)
4277c478bd9Sstevel@tonic-gate fastscan = looppages / loopfraction;
4287c478bd9Sstevel@tonic-gate
4297c478bd9Sstevel@tonic-gate /*
4307c478bd9Sstevel@tonic-gate * Set slow scan time to 1/10 the fast scan time, but
4317c478bd9Sstevel@tonic-gate * not to exceed maxslowscan.
4327c478bd9Sstevel@tonic-gate */
4337c478bd9Sstevel@tonic-gate if (init_sscan == 0)
4347c478bd9Sstevel@tonic-gate slowscan = MIN(fastscan / 10, maxslowscan);
4357c478bd9Sstevel@tonic-gate else
4367c478bd9Sstevel@tonic-gate slowscan = init_sscan;
4377c478bd9Sstevel@tonic-gate if (slowscan > fastscan / 2)
4387c478bd9Sstevel@tonic-gate slowscan = fastscan / 2;
4397c478bd9Sstevel@tonic-gate
4407c478bd9Sstevel@tonic-gate /*
4417c478bd9Sstevel@tonic-gate * Handspreadpages is distance (in pages) between front and back
4427c478bd9Sstevel@tonic-gate * pageout daemon hands. The amount of time to reclaim a page
4437c478bd9Sstevel@tonic-gate * once pageout examines it increases with this distance and
4447c478bd9Sstevel@tonic-gate * decreases as the scan rate rises. It must be < the amount
4457c478bd9Sstevel@tonic-gate * of pageable memory.
4467c478bd9Sstevel@tonic-gate *
4477c478bd9Sstevel@tonic-gate * Since pageout is limited to ~4% of the CPU, setting handspreadpages
4487c478bd9Sstevel@tonic-gate * to be "fastscan" results in the front hand being a few secs
4497c478bd9Sstevel@tonic-gate * (varies based on the processor speed) ahead of the back hand
4507c478bd9Sstevel@tonic-gate * at fastscan rates. This distance can be further reduced, if
4517c478bd9Sstevel@tonic-gate * necessary, by increasing the processor time used by pageout
4527c478bd9Sstevel@tonic-gate * to be more than ~4% and preferrably not more than ~10%.
4537c478bd9Sstevel@tonic-gate *
4547c478bd9Sstevel@tonic-gate * As a result, user processes have a much better chance of
4557c478bd9Sstevel@tonic-gate * referencing their pages before the back hand examines them.
4567c478bd9Sstevel@tonic-gate * This also significantly lowers the number of reclaims from
4577c478bd9Sstevel@tonic-gate * the freelist since pageout does not end up freeing pages which
4587c478bd9Sstevel@tonic-gate * may be referenced a sec later.
4597c478bd9Sstevel@tonic-gate */
4607c478bd9Sstevel@tonic-gate if (init_hspages == 0)
4617c478bd9Sstevel@tonic-gate handspreadpages = fastscan;
4627c478bd9Sstevel@tonic-gate else
4637c478bd9Sstevel@tonic-gate handspreadpages = init_hspages;
4647c478bd9Sstevel@tonic-gate
4657c478bd9Sstevel@tonic-gate /*
4667c478bd9Sstevel@tonic-gate * Make sure that back hand follows front hand by at least
4677c478bd9Sstevel@tonic-gate * 1/RATETOSCHEDPAGING seconds. Without this test, it is possible
4687c478bd9Sstevel@tonic-gate * for the back hand to look at a page during the same wakeup of
4697c478bd9Sstevel@tonic-gate * the pageout daemon in which the front hand cleared its ref bit.
4707c478bd9Sstevel@tonic-gate */
4717c478bd9Sstevel@tonic-gate if (handspreadpages >= looppages)
4727c478bd9Sstevel@tonic-gate handspreadpages = looppages - 1;
4737c478bd9Sstevel@tonic-gate
4747c478bd9Sstevel@tonic-gate /*
4757c478bd9Sstevel@tonic-gate * If we have been called to recalculate the parameters,
4767c478bd9Sstevel@tonic-gate * set a flag to re-evaluate the clock hand pointers.
4777c478bd9Sstevel@tonic-gate */
4787c478bd9Sstevel@tonic-gate if (recalc)
4797c478bd9Sstevel@tonic-gate reset_hands = 1;
4807c478bd9Sstevel@tonic-gate }
4817c478bd9Sstevel@tonic-gate
4827c478bd9Sstevel@tonic-gate /*
4837c478bd9Sstevel@tonic-gate * Pageout scheduling.
4847c478bd9Sstevel@tonic-gate *
4857c478bd9Sstevel@tonic-gate * Schedpaging controls the rate at which the page out daemon runs by
4867c478bd9Sstevel@tonic-gate * setting the global variables nscan and desscan RATETOSCHEDPAGING
4877c478bd9Sstevel@tonic-gate * times a second. Nscan records the number of pages pageout has examined
4887c478bd9Sstevel@tonic-gate * in its current pass; schedpaging resets this value to zero each time
4897c478bd9Sstevel@tonic-gate * it runs. Desscan records the number of pages pageout should examine
4907c478bd9Sstevel@tonic-gate * in its next pass; schedpaging sets this value based on the amount of
4917c478bd9Sstevel@tonic-gate * currently available memory.
4927c478bd9Sstevel@tonic-gate */
4937c478bd9Sstevel@tonic-gate
4947c478bd9Sstevel@tonic-gate #define RATETOSCHEDPAGING 4 /* hz that is */
4957c478bd9Sstevel@tonic-gate
4967c478bd9Sstevel@tonic-gate static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
4977c478bd9Sstevel@tonic-gate
4987c478bd9Sstevel@tonic-gate /*
4997c478bd9Sstevel@tonic-gate * Pool of available async pageout putpage requests.
5007c478bd9Sstevel@tonic-gate */
5017c478bd9Sstevel@tonic-gate static struct async_reqs *push_req;
5027c478bd9Sstevel@tonic-gate static struct async_reqs *req_freelist; /* available req structs */
5037c478bd9Sstevel@tonic-gate static struct async_reqs *push_list; /* pending reqs */
5047c478bd9Sstevel@tonic-gate static kmutex_t push_lock; /* protects req pool */
5057c478bd9Sstevel@tonic-gate static kcondvar_t push_cv;
5067c478bd9Sstevel@tonic-gate
5077c478bd9Sstevel@tonic-gate static int async_list_size = 256; /* number of async request structs */
5087c478bd9Sstevel@tonic-gate
5097c478bd9Sstevel@tonic-gate static void pageout_scanner(void);
5107c478bd9Sstevel@tonic-gate
5117c478bd9Sstevel@tonic-gate /*
5127c478bd9Sstevel@tonic-gate * If a page is being shared more than "po_share" times
5137c478bd9Sstevel@tonic-gate * then leave it alone- don't page it out.
5147c478bd9Sstevel@tonic-gate */
5157c478bd9Sstevel@tonic-gate #define MIN_PO_SHARE (8)
5167c478bd9Sstevel@tonic-gate #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
5177c478bd9Sstevel@tonic-gate ulong_t po_share = MIN_PO_SHARE;
5187c478bd9Sstevel@tonic-gate
5197c478bd9Sstevel@tonic-gate /*
5207c478bd9Sstevel@tonic-gate * Schedule rate for paging.
5217c478bd9Sstevel@tonic-gate * Rate is linear interpolation between
5227c478bd9Sstevel@tonic-gate * slowscan with lotsfree and fastscan when out of memory.
 *
 * Each invocation triggers the kmem/segment-cache reapers and the kernel
 * cage thread if memory is low, recomputes desscan and pageout_ticks for
 * the scanner (only if pageout_mutex can be taken without blocking),
 * signals the scanner when there is work for it, wakes memory waiters,
 * and finally re-arms itself via timeout().
 *
 * arg is not interpreted here; it is only handed back to timeout() so
 * the next invocation receives the same value.
5237c478bd9Sstevel@tonic-gate */
5247c478bd9Sstevel@tonic-gate static void
schedpaging(void * arg)5257c478bd9Sstevel@tonic-gate schedpaging(void *arg)
5267c478bd9Sstevel@tonic-gate {
5277c478bd9Sstevel@tonic-gate spgcnt_t vavail;
5287c478bd9Sstevel@tonic-gate
/* Start reaping kmem caches a little before the lotsfree threshold. */
5297c478bd9Sstevel@tonic-gate if (freemem < lotsfree + needfree + kmem_reapahead)
5307c478bd9Sstevel@tonic-gate kmem_reap();
5317c478bd9Sstevel@tonic-gate
532a98e9dbfSaguzovsk if (freemem < lotsfree + needfree)
5337c478bd9Sstevel@tonic-gate seg_preap();
5347c478bd9Sstevel@tonic-gate
5357c478bd9Sstevel@tonic-gate if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
5367c478bd9Sstevel@tonic-gate kcage_cageout_wakeup();
5377c478bd9Sstevel@tonic-gate
/*
 * Non-blocking acquire: if the mutex is busy, the scan parameters are
 * simply left as they are for this round.
 */
5387c478bd9Sstevel@tonic-gate if (mutex_tryenter(&pageout_mutex)) {
5397c478bd9Sstevel@tonic-gate /* pageout() not running */
5407c478bd9Sstevel@tonic-gate nscan = 0;
/* vavail: free memory net of deficit, clamped to [0, lotsfree]. */
5417c478bd9Sstevel@tonic-gate vavail = freemem - deficit;
54206cfbf35Sjimp if (pageout_new_spread != 0)
54306cfbf35Sjimp vavail -= needfree;
5447c478bd9Sstevel@tonic-gate if (vavail < 0)
5457c478bd9Sstevel@tonic-gate vavail = 0;
5467c478bd9Sstevel@tonic-gate if (vavail > lotsfree)
5477c478bd9Sstevel@tonic-gate vavail = lotsfree;
5487c478bd9Sstevel@tonic-gate
5497c478bd9Sstevel@tonic-gate /*
5507c478bd9Sstevel@tonic-gate * Fix for 1161438 (CRS SPR# 73922). All variables
5517c478bd9Sstevel@tonic-gate * in the original calculation for desscan were 32 bit signed
5527c478bd9Sstevel@tonic-gate * ints. As freemem approaches 0x0 on a system with 1 Gig or
5537c478bd9Sstevel@tonic-gate * more of memory, the calculation can overflow. When this
5547c478bd9Sstevel@tonic-gate * happens, desscan becomes negative and pageout_scanner()
5557c478bd9Sstevel@tonic-gate * stops paging out.
5567c478bd9Sstevel@tonic-gate */
55706cfbf35Sjimp if ((needfree) && (pageout_new_spread == 0)) {
55806cfbf35Sjimp /*
55906cfbf35Sjimp * If we've not yet collected enough samples to
56006cfbf35Sjimp * calculate a spread, use the old logic of kicking
56106cfbf35Sjimp * into high gear anytime needfree is non-zero.
56206cfbf35Sjimp */
5637c478bd9Sstevel@tonic-gate desscan = fastscan / RATETOSCHEDPAGING;
5647c478bd9Sstevel@tonic-gate } else {
56506cfbf35Sjimp /*
56606cfbf35Sjimp * Once we've calculated a spread based on system
56706cfbf35Sjimp * memory and usage, just treat needfree as another
56806cfbf35Sjimp * form of deficit.
56906cfbf35Sjimp */
5707c478bd9Sstevel@tonic-gate spgcnt_t faststmp, slowstmp, result;
5717c478bd9Sstevel@tonic-gate
/*
 * Weighted average of slowscan (weight vavail) and fastscan
 * (weight lotsfree - vavail), scaled to one schedpaging
 * interval.  nz() guards the division; presumably it maps a
 * zero lotsfree to a non-zero value - confirm at its definition.
 */
5727c478bd9Sstevel@tonic-gate slowstmp = slowscan * vavail;
5737c478bd9Sstevel@tonic-gate faststmp = fastscan * (lotsfree - vavail);
5747c478bd9Sstevel@tonic-gate result = (slowstmp + faststmp) /
5757c478bd9Sstevel@tonic-gate nz(lotsfree) / RATETOSCHEDPAGING;
5767c478bd9Sstevel@tonic-gate desscan = (pgcnt_t)result;
5777c478bd9Sstevel@tonic-gate }
5787c478bd9Sstevel@tonic-gate
/*
 * Tick budget the scanner compares against on its next wakeup:
 * min_pageout_ticks when vavail == lotsfree, growing linearly to
 * max_pageout_ticks as vavail falls to 0.
 */
5797c478bd9Sstevel@tonic-gate pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
5807c478bd9Sstevel@tonic-gate (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
5817c478bd9Sstevel@tonic-gate
/*
 * Wake the scanner if memory is short, or if it still has to
 * collect performance samples regardless of memory pressure.
 */
5827c478bd9Sstevel@tonic-gate if (freemem < lotsfree + needfree ||
5837c478bd9Sstevel@tonic-gate pageout_sample_cnt < pageout_sample_lim) {
5847c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
5857c478bd9Sstevel@tonic-gate "pageout_cv_signal:freemem %ld", freemem);
5867c478bd9Sstevel@tonic-gate cv_signal(&proc_pageout->p_cv);
5877c478bd9Sstevel@tonic-gate } else {
5887c478bd9Sstevel@tonic-gate /*
5897c478bd9Sstevel@tonic-gate * There are enough free pages, no need to
5907c478bd9Sstevel@tonic-gate * kick the scanner thread. And next time
5917c478bd9Sstevel@tonic-gate * around, keep more of the `highly shared'
5927c478bd9Sstevel@tonic-gate * pages.
5937c478bd9Sstevel@tonic-gate */
5947c478bd9Sstevel@tonic-gate cv_signal_pageout();
5957c478bd9Sstevel@tonic-gate if (po_share > MIN_PO_SHARE) {
5967c478bd9Sstevel@tonic-gate po_share >>= 1;
5977c478bd9Sstevel@tonic-gate }
5987c478bd9Sstevel@tonic-gate }
5997c478bd9Sstevel@tonic-gate mutex_exit(&pageout_mutex);
6007c478bd9Sstevel@tonic-gate }
6017c478bd9Sstevel@tonic-gate
6027c478bd9Sstevel@tonic-gate /*
6037c478bd9Sstevel@tonic-gate * Signal threads waiting for available memory.
6047c478bd9Sstevel@tonic-gate * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
6057c478bd9Sstevel@tonic-gate * in this case it is not needed - the waiters will be woken up during
6067c478bd9Sstevel@tonic-gate * the next invocation of this function.
6077c478bd9Sstevel@tonic-gate */
6087c478bd9Sstevel@tonic-gate if (kmem_avail() > 0)
6097c478bd9Sstevel@tonic-gate cv_broadcast(&memavail_cv);
6107c478bd9Sstevel@tonic-gate
/* Re-arm: run again after hz / RATETOSCHEDPAGING ticks. */
6117c478bd9Sstevel@tonic-gate (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
6127c478bd9Sstevel@tonic-gate }
6137c478bd9Sstevel@tonic-gate
/*
 * pushes counts VOP_PUTPAGE calls that returned 0 in pageout(); it is
 * reset to 0 each time pageout() returns from waiting on push_cv.
 */
6147c478bd9Sstevel@tonic-gate pgcnt_t pushes;
6157c478bd9Sstevel@tonic-gate ulong_t push_list_size; /* # of requests on pageout queue */
6167c478bd9Sstevel@tonic-gate
/* Values of the "whichhand" argument to checkpage(). */
6177c478bd9Sstevel@tonic-gate #define FRONT 1
6187c478bd9Sstevel@tonic-gate #define BACK 2
6197c478bd9Sstevel@tonic-gate
/* When zero, pageout_scanner() goes straight back to waiting on each wakeup. */
6207c478bd9Sstevel@tonic-gate int dopageout = 1; /* must be non-zero to turn page stealing on */
6217c478bd9Sstevel@tonic-gate
6227c478bd9Sstevel@tonic-gate /*
6237c478bd9Sstevel@tonic-gate * The page out daemon, which runs as process 2.
6247c478bd9Sstevel@tonic-gate *
6257c478bd9Sstevel@tonic-gate * As long as there are at least lotsfree pages,
6267c478bd9Sstevel@tonic-gate * this process is not run. When the number of free
6277c478bd9Sstevel@tonic-gate * pages stays in the range desfree to lotsfree,
6287c478bd9Sstevel@tonic-gate * this daemon runs through the pages in the loop
6297c478bd9Sstevel@tonic-gate * at a rate determined in schedpaging(). Pageout manages
6307c478bd9Sstevel@tonic-gate * two hands on the clock. The front hand moves through
6317c478bd9Sstevel@tonic-gate * memory, clearing the reference bit,
6327c478bd9Sstevel@tonic-gate * and stealing pages from procs that are over maxrss.
6337c478bd9Sstevel@tonic-gate * The back hand travels a distance behind the front hand,
6347c478bd9Sstevel@tonic-gate * freeing the pages that have not been referenced in the time
6357c478bd9Sstevel@tonic-gate * since the front hand passed. If modified, they are pushed to
6367c478bd9Sstevel@tonic-gate * swap before being freed.
6377c478bd9Sstevel@tonic-gate *
6387c478bd9Sstevel@tonic-gate * There are 2 threads that act on behalf of the pageout process.
6397c478bd9Sstevel@tonic-gate * One thread scans pages (pageout_scanner) and frees them up if
6407c478bd9Sstevel@tonic-gate * they don't require any VOP_PUTPAGE operation. If a page must be
6417c478bd9Sstevel@tonic-gate * written back to its backing store, the request is put on a list
6427c478bd9Sstevel@tonic-gate * and the other (pageout) thread is signaled. The pageout thread
6437c478bd9Sstevel@tonic-gate * grabs VOP_PUTPAGE requests from the list, and processes them.
6447c478bd9Sstevel@tonic-gate * Some filesystems may require resources for the VOP_PUTPAGE
6457c478bd9Sstevel@tonic-gate * operations (like memory) and hence can block the pageout
6467c478bd9Sstevel@tonic-gate * thread, but the scanner thread can still operate. There is still
647da6c28aaSamw * no guarantee that memory deadlocks cannot occur.
6487c478bd9Sstevel@tonic-gate *
6497c478bd9Sstevel@tonic-gate * For now, this thing is in very rough form.
6507c478bd9Sstevel@tonic-gate */
6517c478bd9Sstevel@tonic-gate void
pageout()6527c478bd9Sstevel@tonic-gate pageout()
6537c478bd9Sstevel@tonic-gate {
6547c478bd9Sstevel@tonic-gate struct async_reqs *arg;
6557c478bd9Sstevel@tonic-gate pri_t pageout_pri;
6567c478bd9Sstevel@tonic-gate int i;
6577c478bd9Sstevel@tonic-gate pgcnt_t max_pushes;
6587c478bd9Sstevel@tonic-gate callb_cpr_t cprinfo;
6597c478bd9Sstevel@tonic-gate
/*
 * Set-up phase: record this process as proc_pageout, zero its time
 * accounting fields and label it "pageout".  Never returns; after the
 * set-up it loops forever servicing push_list.
 */
6607c478bd9Sstevel@tonic-gate proc_pageout = ttoproc(curthread);
6617c478bd9Sstevel@tonic-gate proc_pageout->p_cstime = 0;
6627c478bd9Sstevel@tonic-gate proc_pageout->p_stime = 0;
6637c478bd9Sstevel@tonic-gate proc_pageout->p_cutime = 0;
6647c478bd9Sstevel@tonic-gate proc_pageout->p_utime = 0;
/*
 * Note the differing lengths: 8 bytes copies the string's terminating
 * NUL into u_psargs, 7 bytes copies only the characters into u_comm.
 */
665ae115bc7Smrj bcopy("pageout", PTOU(curproc)->u_psargs, 8);
666ae115bc7Smrj bcopy("pageout", PTOU(curproc)->u_comm, 7);
6677c478bd9Sstevel@tonic-gate
6687c478bd9Sstevel@tonic-gate /*
6697c478bd9Sstevel@tonic-gate * Initialize the mutexes used by schedpaging(), pageout_scanner()
 * and the push loop below.  This must happen before the scanner
 * thread is created and before schedpaging() is first called.
6707c478bd9Sstevel@tonic-gate */
6717c478bd9Sstevel@tonic-gate mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
6727c478bd9Sstevel@tonic-gate mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
6737c478bd9Sstevel@tonic-gate
6747c478bd9Sstevel@tonic-gate /*
6757c478bd9Sstevel@tonic-gate * Allocate and initialize the async request structures
6767c478bd9Sstevel@tonic-gate * for pageout.
6777c478bd9Sstevel@tonic-gate */
6787c478bd9Sstevel@tonic-gate push_req = (struct async_reqs *)
6797c478bd9Sstevel@tonic-gate kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
6807c478bd9Sstevel@tonic-gate
/*
 * Chain the array into a singly linked free list.  The last element's
 * a_next is left as zeroed by kmem_zalloc(), terminating the list.
 */
6817c478bd9Sstevel@tonic-gate req_freelist = push_req;
6827c478bd9Sstevel@tonic-gate for (i = 0; i < async_list_size - 1; i++)
6837c478bd9Sstevel@tonic-gate push_req[i].a_next = &push_req[i + 1];
6847c478bd9Sstevel@tonic-gate
6857c478bd9Sstevel@tonic-gate pageout_pri = curthread->t_pri;
686*35a5a358SJonathan Adams
687*35a5a358SJonathan Adams /* Create the pageout scanner thread. */
688*35a5a358SJonathan Adams (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
689*35a5a358SJonathan Adams pageout_pri - 1);
6907c478bd9Sstevel@tonic-gate
6917c478bd9Sstevel@tonic-gate /*
6927c478bd9Sstevel@tonic-gate * kick off pageout scheduler.
 * This is the only direct call; afterwards schedpaging() re-arms
 * itself through timeout().
6937c478bd9Sstevel@tonic-gate */
6947c478bd9Sstevel@tonic-gate schedpaging(NULL);
6957c478bd9Sstevel@tonic-gate
6967c478bd9Sstevel@tonic-gate /*
6977c478bd9Sstevel@tonic-gate * Create kernel cage thread.
6987c478bd9Sstevel@tonic-gate * The kernel cage thread is started under the pageout process
6997c478bd9Sstevel@tonic-gate * to take advantage of the less restricted page allocation
7007c478bd9Sstevel@tonic-gate * in page_create_throttle().
7017c478bd9Sstevel@tonic-gate */
7027c478bd9Sstevel@tonic-gate kcage_cageout_init();
7037c478bd9Sstevel@tonic-gate
7047c478bd9Sstevel@tonic-gate /*
7057c478bd9Sstevel@tonic-gate * Limit pushes to avoid saturating pageout devices.
7067c478bd9Sstevel@tonic-gate */
7077c478bd9Sstevel@tonic-gate max_pushes = maxpgio / RATETOSCHEDPAGING;
7087c478bd9Sstevel@tonic-gate CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
7097c478bd9Sstevel@tonic-gate
7107c478bd9Sstevel@tonic-gate for (;;) {
7117c478bd9Sstevel@tonic-gate mutex_enter(&push_lock);
7127c478bd9Sstevel@tonic-gate
/*
 * Sleep while there is nothing queued or the push budget has
 * been exceeded.  Every wakeup resets the budget, so max_pushes
 * applies per wakeup of push_cv.
 */
7137c478bd9Sstevel@tonic-gate while ((arg = push_list) == NULL || pushes > max_pushes) {
7147c478bd9Sstevel@tonic-gate CALLB_CPR_SAFE_BEGIN(&cprinfo);
7157c478bd9Sstevel@tonic-gate cv_wait(&push_cv, &push_lock);
7167c478bd9Sstevel@tonic-gate pushes = 0;
7177c478bd9Sstevel@tonic-gate CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
7187c478bd9Sstevel@tonic-gate }
/* Unlink the head request; the I/O is issued without push_lock held. */
7197c478bd9Sstevel@tonic-gate push_list = arg->a_next;
7207c478bd9Sstevel@tonic-gate arg->a_next = NULL;
7217c478bd9Sstevel@tonic-gate mutex_exit(&push_lock);
7227c478bd9Sstevel@tonic-gate
/* Only pushes for which VOP_PUTPAGE returned 0 count against the budget. */
7237c478bd9Sstevel@tonic-gate if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
72406cfbf35Sjimp arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
7257c478bd9Sstevel@tonic-gate pushes++;
7267c478bd9Sstevel@tonic-gate }
7277c478bd9Sstevel@tonic-gate
7287c478bd9Sstevel@tonic-gate /* vp held by checkpage() */
7297c478bd9Sstevel@tonic-gate VN_RELE(arg->a_vp);
7307c478bd9Sstevel@tonic-gate
7317c478bd9Sstevel@tonic-gate mutex_enter(&push_lock);
7327c478bd9Sstevel@tonic-gate arg->a_next = req_freelist; /* back on freelist */
7337c478bd9Sstevel@tonic-gate req_freelist = arg;
7347c478bd9Sstevel@tonic-gate push_list_size--;
7357c478bd9Sstevel@tonic-gate mutex_exit(&push_lock);
7367c478bd9Sstevel@tonic-gate }
7377c478bd9Sstevel@tonic-gate }
7387c478bd9Sstevel@tonic-gate
7397c478bd9Sstevel@tonic-gate /*
7407c478bd9Sstevel@tonic-gate * Kernel thread that scans pages looking for ones to free
 *
 * Never returns.  Holds pageout_mutex whenever it is not blocked in
 * cv_wait() on proc_pageout->p_cv.  On each wakeup it advances both clock
 * hands one page at a time, calling checkpage() on each, until nscan
 * reaches the limit, memory is no longer short (unless still sampling),
 * the tick budget in pageout_ticks is used up, or the front hand has
 * wrapped too often without progress.
7417c478bd9Sstevel@tonic-gate */
7427c478bd9Sstevel@tonic-gate static void
pageout_scanner(void)7437c478bd9Sstevel@tonic-gate pageout_scanner(void)
7447c478bd9Sstevel@tonic-gate {
7457c478bd9Sstevel@tonic-gate struct page *fronthand, *backhand;
/* count: front-hand wraps since checkpage() last returned 1 */
7467c478bd9Sstevel@tonic-gate uint_t count;
7477c478bd9Sstevel@tonic-gate callb_cpr_t cprinfo;
7487c478bd9Sstevel@tonic-gate pgcnt_t nscan_limit;
/* pcount: pages visited during the current wakeup, eligible or not */
7497c478bd9Sstevel@tonic-gate pgcnt_t pcount;
7507c478bd9Sstevel@tonic-gate
7517c478bd9Sstevel@tonic-gate CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
7527c478bd9Sstevel@tonic-gate mutex_enter(&pageout_mutex);
7537c478bd9Sstevel@tonic-gate
7547c478bd9Sstevel@tonic-gate /*
7557c478bd9Sstevel@tonic-gate * The restart case does not attempt to point the hands at roughly
7567c478bd9Sstevel@tonic-gate * the right point on the assumption that after one circuit things
7577c478bd9Sstevel@tonic-gate * will have settled down - and restarts shouldn't be that often.
7587c478bd9Sstevel@tonic-gate */
7597c478bd9Sstevel@tonic-gate
7607c478bd9Sstevel@tonic-gate /*
7617c478bd9Sstevel@tonic-gate * Set the two clock hands to be separated by a reasonable amount,
7627c478bd9Sstevel@tonic-gate * but no more than 360 degrees apart.
7637c478bd9Sstevel@tonic-gate */
7647c478bd9Sstevel@tonic-gate backhand = page_first();
7657c478bd9Sstevel@tonic-gate if (handspreadpages >= total_pages)
7667c478bd9Sstevel@tonic-gate fronthand = page_nextn(backhand, total_pages - 1);
7677c478bd9Sstevel@tonic-gate else
7687c478bd9Sstevel@tonic-gate fronthand = page_nextn(backhand, handspreadpages);
7697c478bd9Sstevel@tonic-gate
/*
 * Convert the min/max CPU percentages into tick budgets per
 * schedpaging interval: at least 1 tick, and max never below min.
 */
7707c478bd9Sstevel@tonic-gate min_pageout_ticks = MAX(1,
7717c478bd9Sstevel@tonic-gate ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
7727c478bd9Sstevel@tonic-gate max_pageout_ticks = MAX(min_pageout_ticks,
7737c478bd9Sstevel@tonic-gate ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
7747c478bd9Sstevel@tonic-gate
7757c478bd9Sstevel@tonic-gate loop:
7767c478bd9Sstevel@tonic-gate cv_signal_pageout();
7777c478bd9Sstevel@tonic-gate
/* Sleep until schedpaging() signals proc_pageout->p_cv. */
7787c478bd9Sstevel@tonic-gate CALLB_CPR_SAFE_BEGIN(&cprinfo);
7797c478bd9Sstevel@tonic-gate cv_wait(&proc_pageout->p_cv, &pageout_mutex);
7807c478bd9Sstevel@tonic-gate CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
7817c478bd9Sstevel@tonic-gate
7827c478bd9Sstevel@tonic-gate if (!dopageout)
7837c478bd9Sstevel@tonic-gate goto loop;
7847c478bd9Sstevel@tonic-gate
/* reset_hands is set elsewhere; re-seat both hands as at start-up. */
7857c478bd9Sstevel@tonic-gate if (reset_hands) {
7867c478bd9Sstevel@tonic-gate reset_hands = 0;
7877c478bd9Sstevel@tonic-gate
7887c478bd9Sstevel@tonic-gate backhand = page_first();
7897c478bd9Sstevel@tonic-gate if (handspreadpages >= total_pages)
7907c478bd9Sstevel@tonic-gate fronthand = page_nextn(backhand, total_pages - 1);
7917c478bd9Sstevel@tonic-gate else
7927c478bd9Sstevel@tonic-gate fronthand = page_nextn(backhand, handspreadpages);
7937c478bd9Sstevel@tonic-gate }
7947c478bd9Sstevel@tonic-gate
7957c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
7967c478bd9Sstevel@tonic-gate count = 0;
7977c478bd9Sstevel@tonic-gate
7987c478bd9Sstevel@tonic-gate TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
7997c478bd9Sstevel@tonic-gate "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
8007c478bd9Sstevel@tonic-gate freemem, lotsfree, nscan, desscan);
8017c478bd9Sstevel@tonic-gate
8027c478bd9Sstevel@tonic-gate /* Kernel probe */
8037c478bd9Sstevel@tonic-gate TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
80406cfbf35Sjimp tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
8057c478bd9Sstevel@tonic-gate
/*
 * While samples are still being collected the scan may cover all of
 * memory; afterwards it is bounded by the desscan value computed in
 * schedpaging().
 */
8067c478bd9Sstevel@tonic-gate pcount = 0;
8077c478bd9Sstevel@tonic-gate if (pageout_sample_cnt < pageout_sample_lim) {
8087c478bd9Sstevel@tonic-gate nscan_limit = total_pages;
8097c478bd9Sstevel@tonic-gate } else {
8107c478bd9Sstevel@tonic-gate nscan_limit = desscan;
8117c478bd9Sstevel@tonic-gate }
/* Timestamps for the tick budget check and for the rate sample. */
812d3d50737SRafael Vanoni pageout_lbolt = ddi_get_lbolt();
8137c478bd9Sstevel@tonic-gate sample_start = gethrtime();
8147c478bd9Sstevel@tonic-gate
8157c478bd9Sstevel@tonic-gate /*
8167c478bd9Sstevel@tonic-gate * Scan the appropriate number of pages for a single duty cycle.
8177c478bd9Sstevel@tonic-gate * However, stop scanning as soon as there is enough free memory.
8187c478bd9Sstevel@tonic-gate * For a short while, we will be sampling the performance of the
8197c478bd9Sstevel@tonic-gate * scanner and need to keep running just to get sample data, in
8207c478bd9Sstevel@tonic-gate * which case we keep going and don't pay attention to whether
8217c478bd9Sstevel@tonic-gate * or not there is enough free memory.
8227c478bd9Sstevel@tonic-gate */
8237c478bd9Sstevel@tonic-gate
8247c478bd9Sstevel@tonic-gate while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
8257c478bd9Sstevel@tonic-gate pageout_sample_cnt < pageout_sample_lim)) {
8267c478bd9Sstevel@tonic-gate int rvfront, rvback;
8277c478bd9Sstevel@tonic-gate
8287c478bd9Sstevel@tonic-gate /*
8297c478bd9Sstevel@tonic-gate * Check to see if we have exceeded our %CPU budget
8307c478bd9Sstevel@tonic-gate * for this wakeup, but not on every single page visited,
8317c478bd9Sstevel@tonic-gate * just every once in a while.
8327c478bd9Sstevel@tonic-gate */
8337c478bd9Sstevel@tonic-gate if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
834d3d50737SRafael Vanoni pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
8357c478bd9Sstevel@tonic-gate if (pageout_cycle_ticks >= pageout_ticks) {
8367c478bd9Sstevel@tonic-gate ++pageout_timeouts;
8377c478bd9Sstevel@tonic-gate break;
8387c478bd9Sstevel@tonic-gate }
8397c478bd9Sstevel@tonic-gate }
8407c478bd9Sstevel@tonic-gate
8417c478bd9Sstevel@tonic-gate /*
8427c478bd9Sstevel@tonic-gate * If checkpage manages to add a page to the free list,
8437c478bd9Sstevel@tonic-gate * we give ourselves another couple of trips around the loop.
8447c478bd9Sstevel@tonic-gate */
8457c478bd9Sstevel@tonic-gate if ((rvfront = checkpage(fronthand, FRONT)) == 1)
8467c478bd9Sstevel@tonic-gate count = 0;
8477c478bd9Sstevel@tonic-gate if ((rvback = checkpage(backhand, BACK)) == 1)
8487c478bd9Sstevel@tonic-gate count = 0;
8497c478bd9Sstevel@tonic-gate
8507c478bd9Sstevel@tonic-gate ++pcount;
8517c478bd9Sstevel@tonic-gate
8527c478bd9Sstevel@tonic-gate /*
8537c478bd9Sstevel@tonic-gate * protected by pageout_mutex instead of cpu_stat_lock
8547c478bd9Sstevel@tonic-gate */
8557c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(CPU, vm, scan, 1);
8567c478bd9Sstevel@tonic-gate
8577c478bd9Sstevel@tonic-gate /*
8587c478bd9Sstevel@tonic-gate * Don't include ineligible pages in the number scanned.
 * The step is skipped only when BOTH hands returned -1.
8597c478bd9Sstevel@tonic-gate */
8607c478bd9Sstevel@tonic-gate if (rvfront != -1 || rvback != -1)
8617c478bd9Sstevel@tonic-gate nscan++;
8627c478bd9Sstevel@tonic-gate
8637c478bd9Sstevel@tonic-gate backhand = page_next(backhand);
8647c478bd9Sstevel@tonic-gate
8657c478bd9Sstevel@tonic-gate /*
8667c478bd9Sstevel@tonic-gate * backhand update and wraparound check are done separately
8677c478bd9Sstevel@tonic-gate * because lint barks when it finds an empty "if" body
8687c478bd9Sstevel@tonic-gate */
8697c478bd9Sstevel@tonic-gate
8707c478bd9Sstevel@tonic-gate if ((fronthand = page_next(fronthand)) == page_first()) {
8717c478bd9Sstevel@tonic-gate TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
8727c478bd9Sstevel@tonic-gate "pageout_hand_wrap:freemem %ld whichhand %d",
8737c478bd9Sstevel@tonic-gate freemem, FRONT);
8747c478bd9Sstevel@tonic-gate
8757c478bd9Sstevel@tonic-gate /*
8767c478bd9Sstevel@tonic-gate * protected by pageout_mutex instead of cpu_stat_lock
8777c478bd9Sstevel@tonic-gate */
8787c478bd9Sstevel@tonic-gate CPU_STATS_ADDQ(CPU, vm, rev, 1);
8797c478bd9Sstevel@tonic-gate if (++count > 1) {
8807c478bd9Sstevel@tonic-gate /*
8817c478bd9Sstevel@tonic-gate * Extremely unlikely, but it happens.
8827c478bd9Sstevel@tonic-gate * We went around the loop at least once
8837c478bd9Sstevel@tonic-gate * and didn't get far enough.
8847c478bd9Sstevel@tonic-gate * If we are still skipping `highly shared'
8857c478bd9Sstevel@tonic-gate * pages, skip fewer of them. Otherwise,
8867c478bd9Sstevel@tonic-gate * give up till the next clock tick.
8877c478bd9Sstevel@tonic-gate */
8887c478bd9Sstevel@tonic-gate if (po_share < MAX_PO_SHARE) {
8897c478bd9Sstevel@tonic-gate po_share <<= 1;
8907c478bd9Sstevel@tonic-gate } else {
8917c478bd9Sstevel@tonic-gate /*
8927c478bd9Sstevel@tonic-gate * Really a "goto loop", but
8937c478bd9Sstevel@tonic-gate * if someone is TRACing or
8947c478bd9Sstevel@tonic-gate * TNF_PROBE_ing, at least
8957c478bd9Sstevel@tonic-gate * make records to show
8967c478bd9Sstevel@tonic-gate * where we are.
8977c478bd9Sstevel@tonic-gate */
8987c478bd9Sstevel@tonic-gate break;
8997c478bd9Sstevel@tonic-gate }
9007c478bd9Sstevel@tonic-gate }
9017c478bd9Sstevel@tonic-gate }
9027c478bd9Sstevel@tonic-gate }
9037c478bd9Sstevel@tonic-gate
9047c478bd9Sstevel@tonic-gate sample_end = gethrtime();
9057c478bd9Sstevel@tonic-gate
9067c478bd9Sstevel@tonic-gate TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
9077c478bd9Sstevel@tonic-gate "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
9087c478bd9Sstevel@tonic-gate freemem, lotsfree, nscan, desscan, count);
9097c478bd9Sstevel@tonic-gate
9107c478bd9Sstevel@tonic-gate /* Kernel probe */
9117c478bd9Sstevel@tonic-gate TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
91206cfbf35Sjimp tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
9137c478bd9Sstevel@tonic-gate
/* Accumulate pages visited and elapsed time for the rate estimate. */
9147c478bd9Sstevel@tonic-gate if (pageout_sample_cnt < pageout_sample_lim) {
9157c478bd9Sstevel@tonic-gate pageout_sample_pages += pcount;
9167c478bd9Sstevel@tonic-gate pageout_sample_etime += sample_end - sample_start;
9177c478bd9Sstevel@tonic-gate ++pageout_sample_cnt;
9187c478bd9Sstevel@tonic-gate }
/*
 * Once enough samples exist, and only while pageout_new_spread is still
 * 0, derive the scan rate (sampled pages scaled by NANOSEC over the
 * sampled elapsed time), set pageout_new_spread to a tenth of it and
 * call setupclock(1).
 */
9197c478bd9Sstevel@tonic-gate if (pageout_sample_cnt >= pageout_sample_lim &&
9207c478bd9Sstevel@tonic-gate pageout_new_spread == 0) {
9217c478bd9Sstevel@tonic-gate pageout_rate = (hrrate_t)pageout_sample_pages *
9227c478bd9Sstevel@tonic-gate (hrrate_t)(NANOSEC) / pageout_sample_etime;
9237c478bd9Sstevel@tonic-gate pageout_new_spread = pageout_rate / 10;
9247c478bd9Sstevel@tonic-gate setupclock(1);
9257c478bd9Sstevel@tonic-gate }
9267c478bd9Sstevel@tonic-gate
9277c478bd9Sstevel@tonic-gate goto loop;
9287c478bd9Sstevel@tonic-gate }
9297c478bd9Sstevel@tonic-gate
9307c478bd9Sstevel@tonic-gate /*
9317c478bd9Sstevel@tonic-gate * Look at the page at hand. If it is locked (e.g., for physical i/o),
9327c478bd9Sstevel@tonic-gate * system (u., page table) or free, then leave it alone. Otherwise,
9337c478bd9Sstevel@tonic-gate * if we are running the front hand, turn off the page's reference bit.
9347c478bd9Sstevel@tonic-gate * If the proc is over maxrss, we take it. If running the back hand,
9357c478bd9Sstevel@tonic-gate * check whether the page has been reclaimed. If not, free the page,
9367c478bd9Sstevel@tonic-gate * pushing it to disk first if necessary.
9377c478bd9Sstevel@tonic-gate *
 * pp is the page to examine; whichhand is FRONT or BACK.
 *
9387c478bd9Sstevel@tonic-gate * Return values:
9397c478bd9Sstevel@tonic-gate * -1 if the page is not a candidate at all,
 *	(this includes a large page that could not be demoted)
9407c478bd9Sstevel@tonic-gate * 0 if not freed, or
 *	(page was referenced, or the i/o request could not be queued)
9417c478bd9Sstevel@tonic-gate * 1 if we freed it.
 *	(also returned when a dirty page was queued for the pageout thread)
9427c478bd9Sstevel@tonic-gate */
9437c478bd9Sstevel@tonic-gate static int
checkpage(struct page * pp,int whichhand)9447c478bd9Sstevel@tonic-gate checkpage(struct page *pp, int whichhand)
9457c478bd9Sstevel@tonic-gate {
9467c478bd9Sstevel@tonic-gate int ppattr;
9477c478bd9Sstevel@tonic-gate int isfs = 0;
9487c478bd9Sstevel@tonic-gate int isexec = 0;
9497c478bd9Sstevel@tonic-gate int pagesync_flag;
9507c478bd9Sstevel@tonic-gate
9517c478bd9Sstevel@tonic-gate /*
9527c478bd9Sstevel@tonic-gate * Skip pages:
9537c478bd9Sstevel@tonic-gate * - associated with the kernel vnode since
9547c478bd9Sstevel@tonic-gate * they are always "exclusively" locked.
9557c478bd9Sstevel@tonic-gate * - that are free
9567c478bd9Sstevel@tonic-gate * - that are shared more than po_share times
9577c478bd9Sstevel@tonic-gate * - that are already locked
 * - that have a non-zero p_lckcnt or p_cowcnt
9587c478bd9Sstevel@tonic-gate *
9597c478bd9Sstevel@tonic-gate * NOTE: These optimizations assume that reads are atomic.
9607c478bd9Sstevel@tonic-gate */
961a98e9dbfSaguzovsk
962a98e9dbfSaguzovsk if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
963a98e9dbfSaguzovsk pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
964a98e9dbfSaguzovsk hat_page_checkshare(pp, po_share)) {
9657c478bd9Sstevel@tonic-gate return (-1);
9667c478bd9Sstevel@tonic-gate }
9677c478bd9Sstevel@tonic-gate
9687c478bd9Sstevel@tonic-gate if (!page_trylock(pp, SE_EXCL)) {
9697c478bd9Sstevel@tonic-gate /*
9707c478bd9Sstevel@tonic-gate * Skip the page if we can't acquire the "exclusive" lock.
9717c478bd9Sstevel@tonic-gate */
9727c478bd9Sstevel@tonic-gate return (-1);
9737c478bd9Sstevel@tonic-gate } else if (PP_ISFREE(pp)) {
9747c478bd9Sstevel@tonic-gate /*
9757c478bd9Sstevel@tonic-gate * It became free between the above check and our actually
9767c478bd9Sstevel@tonic-gate * locking the page. Oh, well there will be other pages.
9777c478bd9Sstevel@tonic-gate */
9787c478bd9Sstevel@tonic-gate page_unlock(pp);
9797c478bd9Sstevel@tonic-gate return (-1);
9807c478bd9Sstevel@tonic-gate }
9817c478bd9Sstevel@tonic-gate
9827c478bd9Sstevel@tonic-gate /*
9837c478bd9Sstevel@tonic-gate * Reject pages that cannot be freed. The page_struct_lock
9847c478bd9Sstevel@tonic-gate * need not be acquired to examine these
9857c478bd9Sstevel@tonic-gate * fields since the page has an "exclusive" lock.
 * (Re-check of the unlocked test above, now under the page lock.)
9867c478bd9Sstevel@tonic-gate */
9877c478bd9Sstevel@tonic-gate if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
9887c478bd9Sstevel@tonic-gate page_unlock(pp);
9897c478bd9Sstevel@tonic-gate return (-1);
9907c478bd9Sstevel@tonic-gate }
9917c478bd9Sstevel@tonic-gate
9927c478bd9Sstevel@tonic-gate /*
9937c478bd9Sstevel@tonic-gate * Maintain statistics for what we are freeing
 * Classification is captured here, before the page is disposed of:
 * isexec for vnodes flagged VVMEXEC, isfs for any vnode that is not
 * a swapfs vnode.  A page with no vnode is counted as anonymous.
9947c478bd9Sstevel@tonic-gate */
9957c478bd9Sstevel@tonic-gate
9967c478bd9Sstevel@tonic-gate if (pp->p_vnode != NULL) {
9977c478bd9Sstevel@tonic-gate if (pp->p_vnode->v_flag & VVMEXEC)
9987c478bd9Sstevel@tonic-gate isexec = 1;
9997c478bd9Sstevel@tonic-gate
10007c478bd9Sstevel@tonic-gate if (!IS_SWAPFSVP(pp->p_vnode))
10017c478bd9Sstevel@tonic-gate isfs = 1;
10027c478bd9Sstevel@tonic-gate }
10037c478bd9Sstevel@tonic-gate
10047c478bd9Sstevel@tonic-gate /*
10057c478bd9Sstevel@tonic-gate * Turn off REF and MOD bits with the front hand.
10067c478bd9Sstevel@tonic-gate * The back hand examines the REF bit and always considers
10077c478bd9Sstevel@tonic-gate * SHARED pages as referenced.
10087c478bd9Sstevel@tonic-gate */
10097c478bd9Sstevel@tonic-gate if (whichhand == FRONT)
10107c478bd9Sstevel@tonic-gate pagesync_flag = HAT_SYNC_ZERORM;
10117c478bd9Sstevel@tonic-gate else
10127c478bd9Sstevel@tonic-gate pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
10137c478bd9Sstevel@tonic-gate HAT_SYNC_STOPON_SHARED;
10147c478bd9Sstevel@tonic-gate
10157c478bd9Sstevel@tonic-gate ppattr = hat_pagesync(pp, pagesync_flag);
10167c478bd9Sstevel@tonic-gate
/*
 * Re-entered from the bottom of the function when the page turns out to
 * have been referenced or modified between the sync and the unload.
 */
10177c478bd9Sstevel@tonic-gate recheck:
10187c478bd9Sstevel@tonic-gate /*
10197c478bd9Sstevel@tonic-gate * If page is referenced; make unreferenced but reclaimable.
10207c478bd9Sstevel@tonic-gate * If this page is not referenced, then it must be reclaimable
10217c478bd9Sstevel@tonic-gate * and we can add it to the free list.
10227c478bd9Sstevel@tonic-gate */
10237c478bd9Sstevel@tonic-gate if (ppattr & P_REF) {
10247c478bd9Sstevel@tonic-gate TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
10257c478bd9Sstevel@tonic-gate "pageout_isref:pp %p whichhand %d", pp, whichhand);
10267c478bd9Sstevel@tonic-gate if (whichhand == FRONT) {
10277c478bd9Sstevel@tonic-gate /*
10287c478bd9Sstevel@tonic-gate * Checking of rss or madvise flags needed here...
10297c478bd9Sstevel@tonic-gate *
10307c478bd9Sstevel@tonic-gate * If not "well-behaved", fall through into the code
10317c478bd9Sstevel@tonic-gate * for not referenced.
10327c478bd9Sstevel@tonic-gate */
10337c478bd9Sstevel@tonic-gate hat_clrref(pp);
10347c478bd9Sstevel@tonic-gate }
10357c478bd9Sstevel@tonic-gate /*
10367c478bd9Sstevel@tonic-gate * Somebody referenced the page since the front
10377c478bd9Sstevel@tonic-gate * hand went by, so it's not a candidate for
10387c478bd9Sstevel@tonic-gate * freeing up.
10397c478bd9Sstevel@tonic-gate */
10407c478bd9Sstevel@tonic-gate page_unlock(pp);
10417c478bd9Sstevel@tonic-gate return (0);
10427c478bd9Sstevel@tonic-gate }
10437c478bd9Sstevel@tonic-gate
10447c478bd9Sstevel@tonic-gate VM_STAT_ADD(pageoutvmstats.checkpage[0]);
10457c478bd9Sstevel@tonic-gate
10467c478bd9Sstevel@tonic-gate /*
10477c478bd9Sstevel@tonic-gate * If large page, attempt to demote it. If successfully demoted,
10487c478bd9Sstevel@tonic-gate * retry the checkpage.
10497c478bd9Sstevel@tonic-gate */
10507c478bd9Sstevel@tonic-gate if (pp->p_szc != 0) {
10517c478bd9Sstevel@tonic-gate if (!page_try_demote_pages(pp)) {
10527c478bd9Sstevel@tonic-gate VM_STAT_ADD(pageoutvmstats.checkpage[1]);
10537c478bd9Sstevel@tonic-gate page_unlock(pp);
10547c478bd9Sstevel@tonic-gate return (-1);
10557c478bd9Sstevel@tonic-gate }
10567c478bd9Sstevel@tonic-gate ASSERT(pp->p_szc == 0);
10577c478bd9Sstevel@tonic-gate VM_STAT_ADD(pageoutvmstats.checkpage[2]);
10587c478bd9Sstevel@tonic-gate /*
10597c478bd9Sstevel@tonic-gate * since page_try_demote_pages() could have unloaded some
10607c478bd9Sstevel@tonic-gate * mappings it makes sense to reload ppattr.
10617c478bd9Sstevel@tonic-gate */
10627c478bd9Sstevel@tonic-gate ppattr = hat_page_getattr(pp, P_MOD | P_REF);
10637c478bd9Sstevel@tonic-gate }
10647c478bd9Sstevel@tonic-gate
10657c478bd9Sstevel@tonic-gate /*
10667c478bd9Sstevel@tonic-gate * If the page is currently dirty, we have to arrange
10677c478bd9Sstevel@tonic-gate * to have it cleaned before it can be freed.
10687c478bd9Sstevel@tonic-gate *
10697c478bd9Sstevel@tonic-gate * XXX - ASSERT(pp->p_vnode != NULL);
10707c478bd9Sstevel@tonic-gate */
10717c478bd9Sstevel@tonic-gate if ((ppattr & P_MOD) && pp->p_vnode) {
/* Snapshot vnode and offset while the page is still locked. */
10727c478bd9Sstevel@tonic-gate struct vnode *vp = pp->p_vnode;
10737c478bd9Sstevel@tonic-gate u_offset_t offset = pp->p_offset;
10747c478bd9Sstevel@tonic-gate
10757c478bd9Sstevel@tonic-gate /*
10767c478bd9Sstevel@tonic-gate * XXX - Test for process being swapped out or about to exit?
10777c478bd9Sstevel@tonic-gate * [Can't get back to process(es) using the page.]
10787c478bd9Sstevel@tonic-gate */
10797c478bd9Sstevel@tonic-gate
10807c478bd9Sstevel@tonic-gate /*
10817c478bd9Sstevel@tonic-gate * Hold the vnode before releasing the page lock to
10827c478bd9Sstevel@tonic-gate * prevent it from being freed and re-used by some
10837c478bd9Sstevel@tonic-gate * other thread.
10847c478bd9Sstevel@tonic-gate */
10857c478bd9Sstevel@tonic-gate VN_HOLD(vp);
10867c478bd9Sstevel@tonic-gate page_unlock(pp);
10877c478bd9Sstevel@tonic-gate
10887c478bd9Sstevel@tonic-gate /*
10897c478bd9Sstevel@tonic-gate * Queue i/o request for the pageout thread.
 * On failure the hold taken above is dropped here; on
 * success pageout() releases it after the VOP_PUTPAGE.
10907c478bd9Sstevel@tonic-gate */
10917c478bd9Sstevel@tonic-gate if (!queue_io_request(vp, offset)) {
10927c478bd9Sstevel@tonic-gate VN_RELE(vp);
10937c478bd9Sstevel@tonic-gate return (0);
10947c478bd9Sstevel@tonic-gate }
10957c478bd9Sstevel@tonic-gate return (1);
10967c478bd9Sstevel@tonic-gate }
10977c478bd9Sstevel@tonic-gate
10987c478bd9Sstevel@tonic-gate /*
10997c478bd9Sstevel@tonic-gate * Now we unload all the translations,
11007c478bd9Sstevel@tonic-gate * and put the page back on to the free list.
11017c478bd9Sstevel@tonic-gate * If the page was used (referenced or modified) after
11027c478bd9Sstevel@tonic-gate * the pagesync but before it was unloaded we catch it
11037c478bd9Sstevel@tonic-gate * and handle the page properly.
11047c478bd9Sstevel@tonic-gate */
11057c478bd9Sstevel@tonic-gate TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
11067c478bd9Sstevel@tonic-gate "pageout_free:pp %p whichhand %d", pp, whichhand);
11077c478bd9Sstevel@tonic-gate (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
11087c478bd9Sstevel@tonic-gate ppattr = hat_page_getattr(pp, P_MOD | P_REF);
11097c478bd9Sstevel@tonic-gate if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
11107c478bd9Sstevel@tonic-gate goto recheck;
11117c478bd9Sstevel@tonic-gate
11127c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional context*/
11137c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_FREE, 0, kcred);
11147c478bd9Sstevel@tonic-gate
11157c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(vm, dfree, 1);
11167c478bd9Sstevel@tonic-gate
/* Attribute the freed page to exactly one of execfree/fsfree/anonfree. */
11177c478bd9Sstevel@tonic-gate if (isfs) {
11187c478bd9Sstevel@tonic-gate if (isexec) {
11197c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(vm, execfree, 1);
11207c478bd9Sstevel@tonic-gate } else {
11217c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(vm, fsfree, 1);
11227c478bd9Sstevel@tonic-gate }
11237c478bd9Sstevel@tonic-gate } else {
11247c478bd9Sstevel@tonic-gate CPU_STATS_ADD_K(vm, anonfree, 1);
11257c478bd9Sstevel@tonic-gate }
11267c478bd9Sstevel@tonic-gate
11277c478bd9Sstevel@tonic-gate return (1); /* freed a page! */
11287c478bd9Sstevel@tonic-gate }
11297c478bd9Sstevel@tonic-gate
11307c478bd9Sstevel@tonic-gate /*
11317c478bd9Sstevel@tonic-gate * Queue async i/o request from pageout_scanner and segment swapout
11327c478bd9Sstevel@tonic-gate * routines on one common list. This ensures that pageout devices (swap)
11337c478bd9Sstevel@tonic-gate * are not saturated by pageout_scanner or swapout requests.
11347c478bd9Sstevel@tonic-gate * The pageout thread empties this list by initiating i/o operations.
11357c478bd9Sstevel@tonic-gate */
11367c478bd9Sstevel@tonic-gate int
queue_io_request(vnode_t * vp,u_offset_t off)11377c478bd9Sstevel@tonic-gate queue_io_request(vnode_t *vp, u_offset_t off)
11387c478bd9Sstevel@tonic-gate {
11397c478bd9Sstevel@tonic-gate struct async_reqs *arg;
11407c478bd9Sstevel@tonic-gate
11417c478bd9Sstevel@tonic-gate /*
11427c478bd9Sstevel@tonic-gate * If we cannot allocate an async request struct,
11437c478bd9Sstevel@tonic-gate * skip this page.
11447c478bd9Sstevel@tonic-gate */
11457c478bd9Sstevel@tonic-gate mutex_enter(&push_lock);
11467c478bd9Sstevel@tonic-gate if ((arg = req_freelist) == NULL) {
11477c478bd9Sstevel@tonic-gate mutex_exit(&push_lock);
11487c478bd9Sstevel@tonic-gate return (0);
11497c478bd9Sstevel@tonic-gate }
11507c478bd9Sstevel@tonic-gate req_freelist = arg->a_next; /* adjust freelist */
11517c478bd9Sstevel@tonic-gate push_list_size++;
11527c478bd9Sstevel@tonic-gate
11537c478bd9Sstevel@tonic-gate arg->a_vp = vp;
11547c478bd9Sstevel@tonic-gate arg->a_off = off;
11557c478bd9Sstevel@tonic-gate arg->a_len = PAGESIZE;
11567c478bd9Sstevel@tonic-gate arg->a_flags = B_ASYNC | B_FREE;
11577c478bd9Sstevel@tonic-gate arg->a_cred = kcred; /* always held */
11587c478bd9Sstevel@tonic-gate
11597c478bd9Sstevel@tonic-gate /*
11607c478bd9Sstevel@tonic-gate * Add to list of pending write requests.
11617c478bd9Sstevel@tonic-gate */
11627c478bd9Sstevel@tonic-gate arg->a_next = push_list;
11637c478bd9Sstevel@tonic-gate push_list = arg;
11647c478bd9Sstevel@tonic-gate
11657c478bd9Sstevel@tonic-gate if (req_freelist == NULL) {
11667c478bd9Sstevel@tonic-gate /*
11677c478bd9Sstevel@tonic-gate * No free async requests left. The lock is held so we
11687c478bd9Sstevel@tonic-gate * might as well signal the pusher thread now.
11697c478bd9Sstevel@tonic-gate */
11707c478bd9Sstevel@tonic-gate cv_signal(&push_cv);
11717c478bd9Sstevel@tonic-gate }
11727c478bd9Sstevel@tonic-gate mutex_exit(&push_lock);
11737c478bd9Sstevel@tonic-gate return (1);
11747c478bd9Sstevel@tonic-gate }
11757c478bd9Sstevel@tonic-gate
11767c478bd9Sstevel@tonic-gate /*
11777c478bd9Sstevel@tonic-gate * Wakeup pageout to initiate i/o if push_list is not empty.
11787c478bd9Sstevel@tonic-gate */
11797c478bd9Sstevel@tonic-gate void
cv_signal_pageout()11807c478bd9Sstevel@tonic-gate cv_signal_pageout()
11817c478bd9Sstevel@tonic-gate {
11827c478bd9Sstevel@tonic-gate if (push_list != NULL) {
11837c478bd9Sstevel@tonic-gate mutex_enter(&push_lock);
11847c478bd9Sstevel@tonic-gate cv_signal(&push_cv);
11857c478bd9Sstevel@tonic-gate mutex_exit(&push_lock);
11867c478bd9Sstevel@tonic-gate }
11877c478bd9Sstevel@tonic-gate }
1188