/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2018 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
#include <sys/stdbool.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

/*
 * FREE MEMORY MANAGEMENT
 *
 * Management of the pool of free pages is a tricky business.  There are
 * several critical threshold values which constrain our allocation of new
 * pages and inform the rate of paging out of memory to swap.  These threshold
 * values, and the behaviour they induce, are described below in descending
 * order of size -- and thus increasing order of severity!
 *
 * +---------------------------------------------------- physmem (all memory)
 * |
 * |	Ordinarily there are no particular constraints placed on page
 * v	allocation.  The page scanner is not running and page_create_va()
 * |	will effectively grant all page requests (whether from the kernel
 * |	or from user processes) without artificial delay.
 * |
 * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
 * |
 * |	When we have less than "lotsfree" pages, pageout_scanner() is
 * v	signalled by schedpaging() to begin looking for pages that can
 * |	be evicted to disk to bring us back above lotsfree.  At this
 * |	stage there is still no constraint on allocation of free pages.
 * |
 * |	For small systems, we set a lower bound of 16MB for lotsfree;
 * v	this is the natural value for a system with 1GB memory.  This is
 * |	to ensure that the pageout reserve pool contains at least 4MB
 * |	for use by ZFS.
 * |
 * |	For systems with a large amount of memory, we constrain lotsfree
 * |	to be at most 2GB (with a pageout reserve of around 0.5GB), as
 * v	at some point the required slack relates more closely to the
 * |	rate at which paging can occur than to the total amount of memory.
 * |
 * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 * |
 * |	When we drop below desfree, a number of kernel facilities will
 * v	wait before allocating more memory, under the assumption that
 * |	pageout or reaping will make progress and free up some memory.
 * |	This behaviour is not especially coordinated; look for comparisons
 * |	of desfree and freemem.
 * |
 * |	In addition to various attempts at advisory caution, clock()
 * |	will wake up the thread that is ordinarily parked in sched().
 * |	This routine is responsible for the heavy-handed swapping out
 * v	of entire processes in an attempt to arrest the slide of free
 * |	memory.  See comments in sched.c for more details.
 * |
 * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 * |
 * |	These two separate tunables have, by default, the same value.
 * v	Various parts of the kernel use minfree to signal the need for
 * |	more aggressive reclamation of memory, and sched() is more
 * |	aggressive at swapping processes out.
 * |
 * |	If free memory falls below throttlefree, page_create_va() will
 * |	use page_create_throttle() to begin holding most requests for
 * |	new pages while pageout and reaping free up memory.  Sleeping
 * v	allocations (e.g., KM_SLEEP) are held here while we wait for
 * |	more memory.  Non-sleeping allocations are generally allowed to
 * |	proceed, unless their priority is explicitly lowered with
 * |	KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).).
 * |
 * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 * |
 * |	When we hit throttlefree, the situation is already dire.  The
 * v	system is generally paging out memory and swapping out entire
 * |	processes in order to free up memory for continued operation.
 * |
 * |	Unfortunately, evicting memory to disk generally requires short
 * |	term use of additional memory; e.g., allocation of buffers for
 * |	storage drivers, updating maps of free and used blocks, etc.
 * |	As such, pageout_reserve is the number of pages that we keep in
 * |	special reserve for use by pageout() and sched() and by any
 * v	other parts of the kernel that need to be working for those to
 * |	make forward progress such as the ZFS I/O pipeline.
 * |
 * |	When we are below pageout_reserve, we fail or hold any allocation
 * |	that has not explicitly requested access to the reserve pool.
 * |	Access to the reserve is generally granted via the KM_PUSHPAGE
 * |	flag, or by marking a thread T_PUSHPAGE such that all allocations
 * |	can implicitly tap the reserve.  For more details, see the
 * v	NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 * |	and VM_PUSHPAGE allocation flags, and page_create_throttle().
 * |
 * +---------------------------------------------------------- no free memory
 * |
 * |	If we have arrived here, things are very bad indeed.  It is
 * v	surprisingly difficult to tell if this condition is even fatal,
 * |	as enough memory may have been granted to pageout() and to the
 * |	ZFS I/O pipeline that requests for eviction that have already been
 * |	made will complete and free up memory some time soon.
 * |
 * |	If free memory does not materialise, the system generally remains
 * |	deadlocked.  The pageout_deadman() below is run once per second
 * |	from clock(), seeking to limit the amount of time a single request
 * v	to page out can be blocked before the system panics to get a crash
 * |	dump and return to service.
 * |
 * +-------------------------------------------------------------------------
 */
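
/*
 * For illustration only: on a hypothetical machine with 16GB of physical
 * memory and the default tunables, the automatic sizing described above
 * works out to roughly:
 *
 *	lotsfree	= 16GB / 64		= 256MB
 *	desfree		= lotsfree / 2		= 128MB
 *	minfree		= 3/4 of desfree	= 96MB
 *	throttlefree	= minfree		= 96MB
 *	pageout_reserve	= 3/4 of throttlefree	= 72MB
 *
 * These figures are just an example of the default policy; any of the
 * thresholds may be overridden via /etc/system, as described below.
 */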

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock().  If they are patched non-zero
 * in a loaded vmunix they are left alone and may thus be changed per system
 * using "mdb -kw" on the loaded system.
 */
pgcnt_t slowscan = 0;
pgcnt_t fastscan = 0;

static pgcnt_t handspreadpages = 0;

/*
 * looppages:
 *	Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *	Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t loopfraction = 2;
static pgcnt_t looppages;

static uint_t min_percent_cpu = 4;
static uint_t max_percent_cpu = 80;
static pgcnt_t maxfastscan = 0;
static pgcnt_t maxslowscan = 100;

#define	MEGABYTES	(1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *	set to 1 to use the previous default threshold size calculation;
 *	i.e., each threshold is half of the next largest value.
 */
uint_t pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 * minimum and maximum are specified in bytes, rather than pages; a zero value
 * means the default values (below) are used.
 */
uint_t lotsfree_fraction = 64;
pgcnt_t lotsfree_min = 0;
pgcnt_t lotsfree_max = 0;

#define	LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
#define	LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)

/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is not larger than the threshold above, the specified value will
 * be used directly without any additional calculation or adjustment.  The boot
 * time value of these overrides is preserved in the "clockinit" struct.  More
 * detail is available in the comment at the top of the file.
 */
pgcnt_t maxpgio = 0;
pgcnt_t minfree = 0;
pgcnt_t desfree = 0;
pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;

pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;

/* kstats */
uint64_t low_mem_scan;

/* The maximum supported number of pageout_scanner() threads */
#define	MAX_PSCAN_THREADS	16

/*
 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 * number of nanoseconds in each wakeup cycle that gives the equivalent of some
 * underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *	nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *	nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *	Number of nanoseconds budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_nsec and max_pageout_nsec,
 *	depending on memory pressure.
 */
static hrtime_t min_pageout_nsec;
static hrtime_t max_pageout_nsec;
static hrtime_t pageout_nsec;
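
/*
 * As a worked example (illustrative only): with the default min_percent_cpu
 * of 4, max_percent_cpu of 80 and SCHEDPAGING_HZ of 4, setupclock() computes
 *
 *	min_pageout_nsec = NANOSEC * 4 / 100 / 4  = 10,000,000ns  (10ms)
 *	max_pageout_nsec = NANOSEC * 80 / 100 / 4 = 200,000,000ns (200ms)
 *
 * i.e., each of the four wakeups per second may spend between 10ms and 200ms
 * scanning, which corresponds to the 4% to 80% CPU duty cycle described above.
 */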

static bool reset_hands[MAX_PSCAN_THREADS];

#define	PAGES_POLL_MASK	1023

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan SCHEDPAGING_HZ
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging() resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging() sets this value based on the amount of
 * currently available memory.
 */
#define	SCHEDPAGING_HZ	4

/*
 * despagescanners:
 *	The desired number of page scanner threads.  For testing purposes, this
 *	value can be set in /etc/system or tuned directly with mdb(1).  The
 *	system will bring the actual number of threads into line with the
 *	desired number.  If set to an invalid value, the system will correct
 *	the setting.
 */
uint_t despagescanners = 0;

/*
 * pageout_sample_lim:
 *	The limit on the number of samples needed to establish a value for new
 *	pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *	handspreadpages.
 *
 * pageout_sample_cnt:
 *	Current sample number.  Once the sample gets large enough, set new
 *	values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *	The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *	The accumulated nanoseconds for the sample.
 *
 * pageout_sampling:
 *	True while sampling is still in progress.
 *
 * pageout_rate:
 *	Rate in pages/second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *	Initially zero while the system scan rate is measured by
 *	pageout_scanner(), which then sets this value once per system boot
 *	after enough samples have been recorded (pageout_sample_cnt).  Once
 *	set, this new value is used for fastscan and handspreadpages.
 */
typedef hrtime_t hrrate_t;

static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
static hrtime_t pageout_sample_etime = 0;
static bool pageout_sampling = true;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;
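
/*
 * For illustration only (hypothetical numbers): if the initial samples scan a
 * total of 8,000,000 pages in an accumulated 2 seconds of scan time, then
 * pageout_sample_add() computes
 *
 *	pageout_rate       = 8,000,000 * NANOSEC / 2,000,000,000 = 4,000,000
 *	pageout_new_spread = pageout_rate / 10                   =   400,000
 *
 * i.e., a measured scan rate of 4 million pages per second, with subsequent
 * values of fastscan and handspreadpages derived from 400,000 pages.
 */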

/* The current number of page scanner threads */
static uint_t n_page_scanners = 1;
/* The number of page scanner threads that are actively scanning. */
static uint_t pageouts_running;

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t memavail_lock;
kcondvar_t memavail_cv;

typedef enum pageout_hand {
	POH_FRONT = 1,
	POH_BACK,
} pageout_hand_t;

typedef enum {
	CKP_INELIGIBLE,
	CKP_NOT_FREED,
	CKP_FREED,
} checkpage_result_t;

static checkpage_result_t checkpage(page_t *, pageout_hand_t);

static struct clockinit {
	bool ci_init;
	pgcnt_t ci_lotsfree_min;
	pgcnt_t ci_lotsfree_max;
	pgcnt_t ci_lotsfree;
	pgcnt_t ci_desfree;
	pgcnt_t ci_minfree;
	pgcnt_t ci_throttlefree;
	pgcnt_t ci_pageout_reserve;
	pgcnt_t ci_maxpgio;
	pgcnt_t ci_maxfastscan;
	pgcnt_t ci_fastscan;
	pgcnt_t ci_slowscan;
	pgcnt_t ci_handspreadpages;
	uint_t ci_despagescanners;
} clockinit = { .ci_init = false };

static inline pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
	if (value < minimum)
		return (minimum);
	else if (value > maximum)
		return (maximum);
	else
		return (value);
}

static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
	if (initval == 0 || initval >= initval_ceiling)
		return (defval);
	else
		return (initval);
}

/*
 * On large memory systems, multiple instances of the page scanner are run,
 * each responsible for a separate region of memory.  This speeds up page
 * invalidation under low memory conditions.
 *
 * For testing purposes, despagescanners can be set in /etc/system or via
 * mdb(1) and it will be used as a guide for how many page scanners to create;
 * the value will be adjusted if it is not sensible.  Otherwise, the number of
 * page scanners is determined dynamically based on handspreadpages.
 */
static void
recalc_pagescanners(void)
{
	uint_t des;

	/* If the initial calibration has not been done, take no action. */
	if (pageout_new_spread == 0)
		return;

	/*
	 * If clockinit.ci_despagescanners is non-zero, then a value for
	 * despagescanners was set during initial boot.  In this case, if
	 * despagescanners has been reset to 0 then we want to revert to
	 * that initial boot value.
	 */
	if (despagescanners == 0)
		despagescanners = clockinit.ci_despagescanners;

	if (despagescanners != 0) {
		/*
		 * We have a desired number of page scanners, either from
		 * /etc/system or set via mdb.  Try and use it (it will be
		 * adjusted below if necessary).
		 */
		des = despagescanners;
	} else {
		/*
		 * Calculate the number of desired scanners based on the
		 * system's memory size.
		 *
		 * A 64GiB region size is used as the basis for calculating how
		 * many scanner threads should be created.  For systems with up
		 * to 64GiB of RAM, a single thread is used; for very large
		 * memory systems the threads are limited to MAX_PSCAN_THREADS.
		 */
		des = (looppages - 1) / btop(64ULL << 30) + 1;
	}

	/*
	 * Clamp the number of scanners so that we have no more than
	 * MAX_PSCAN_THREADS and so that each scanner covers at least 10% more
	 * than handspreadpages.
	 */
	pgcnt_t min_scanner_pages = handspreadpages + handspreadpages / 10;
	pgcnt_t max_scanners = looppages / min_scanner_pages;
	despagescanners = clamp(des, 1,
	    clamp(max_scanners, 1, MAX_PSCAN_THREADS));
}
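
/*
 * For illustration only: on a hypothetical system with 256GiB of memory and
 * no operator override, the calculation above yields
 *
 *	des = (looppages - 1) / btop(64GiB) + 1 = 4
 *
 * i.e., four scanner threads, each owning roughly a 64GiB region of the
 * memory clock face, subject to the clamp against MAX_PSCAN_THREADS and the
 * minimum per-scanner span derived from handspreadpages.
 */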

/*
 * Set up the paging constants for the clock algorithm used by
 * pageout_scanner(), and by the virtual memory system overall.  See the
 * comments at the top of this file for more information about the threshold
 * values and system responses to memory pressure.
 *
 * This routine is called once by main() at startup, after the initial size of
 * physical memory is determined.  It may be called again later if memory is
 * added to or removed from the system, or if new measurements of the page scan
 * rate become available.
 */
void
setupclock(void)
{
	bool half = (pageout_threshold_style == 1);
	bool recalc = true;

	looppages = total_pages;

	/*
	 * The operator may have provided specific values for some of the
	 * tunables via /etc/system.  On our first call, we preserve those
	 * values so that they can be used for subsequent recalculations.
	 *
	 * A value of zero for any tunable means we will use the default
	 * sizing.
	 */
	if (!clockinit.ci_init) {
		clockinit.ci_init = true;

		clockinit.ci_lotsfree_min = lotsfree_min;
		clockinit.ci_lotsfree_max = lotsfree_max;
		clockinit.ci_lotsfree = lotsfree;
		clockinit.ci_desfree = desfree;
		clockinit.ci_minfree = minfree;
		clockinit.ci_throttlefree = throttlefree;
		clockinit.ci_pageout_reserve = pageout_reserve;
		clockinit.ci_maxpgio = maxpgio;
		clockinit.ci_maxfastscan = maxfastscan;
		clockinit.ci_fastscan = fastscan;
		clockinit.ci_slowscan = slowscan;
		clockinit.ci_handspreadpages = handspreadpages;
		clockinit.ci_despagescanners = despagescanners;

		/*
		 * The first call does not trigger a recalculation, only
		 * subsequent calls.
		 */
		recalc = false;
	}

	/*
	 * Configure paging threshold values.  For more details on what each
	 * threshold signifies, see the comments at the top of this file.
	 */
	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
	    btop(LOTSFREE_MAX_DEFAULT));
	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
	    btop(LOTSFREE_MIN_DEFAULT));

	lotsfree = tune(clockinit.ci_lotsfree, looppages,
	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

	desfree = tune(clockinit.ci_desfree, lotsfree,
	    lotsfree / 2);

	minfree = tune(clockinit.ci_minfree, desfree,
	    half ? desfree / 2 : 3 * desfree / 4);

	throttlefree = tune(clockinit.ci_throttlefree, desfree,
	    minfree);

	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
	    half ? throttlefree / 2 : 3 * throttlefree / 4);

	/*
	 * Maxpgio thresholds how much paging is acceptable.
	 * This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (clockinit.ci_maxpgio == 0) {
		maxpgio = (DISKRPM * 2) / 3;
	} else {
		maxpgio = clockinit.ci_maxpgio;
	}

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  Fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, Mhz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPU's and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPU's.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 Mhz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 Mhz SC2000:		68 Meg
	 *
	 *	40 Mhz 486:		26 Meg
	 *	66 Mhz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 Mhz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPU's can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
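
	/*
	 * As an illustration of the default sizing below (not normative):
	 * before the scan-rate sampling has completed, pageout_new_spread is
	 * still zero, so maxfastscan falls back to MAXHANDSPREADPAGES, which
	 * the comment above describes as 64MB of pages; with 4KiB pages that
	 * would be 16384 pages.  On a hypothetical 16GiB machine, fastscan =
	 * MIN(looppages / 2, 16384) = 16384 pages and slowscan =
	 * MIN(fastscan / 10, maxslowscan) = 100 pages.  Once sampling
	 * completes, setupclock() is called again and the measured
	 * pageout_new_spread takes the place of MAXHANDSPREADPAGES.
	 */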
	if (clockinit.ci_maxfastscan == 0) {
		if (pageout_new_spread != 0) {
			maxfastscan = pageout_new_spread;
		} else {
			maxfastscan = MAXHANDSPREADPAGES;
		}
	} else {
		maxfastscan = clockinit.ci_maxfastscan;
	}

	if (clockinit.ci_fastscan == 0) {
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	} else {
		fastscan = clockinit.ci_fastscan;
	}

	if (fastscan > looppages / loopfraction) {
		fastscan = looppages / loopfraction;
	}

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (clockinit.ci_slowscan == 0) {
		slowscan = MIN(fastscan / 10, maxslowscan);
	} else {
		slowscan = clockinit.ci_slowscan;
	}

	if (slowscan > fastscan / 2) {
		slowscan = fastscan / 2;
	}

	/*
	 * Handspreadpages is the distance (in pages) between front and back
	 * pageout daemon hands.  The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises.  It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (clockinit.ci_handspreadpages == 0) {
		handspreadpages = fastscan;
	} else {
		handspreadpages = clockinit.ci_handspreadpages;
	}

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
	 * back hand to look at a page during the same wakeup of the pageout
	 * daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages) {
		handspreadpages = looppages - 1;
	}

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle.  The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second.
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);

	/*
	 * If not called for recalculation, return and skip the remaining
	 * steps.
	 */
	if (!recalc)
		return;

	/*
	 * Set a flag to re-evaluate the clock hand positions.
	 */
	for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
		reset_hands[i] = true;

	recalc_pagescanners();
}

static kmutex_t pageout_mutex;

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

/*
 * If pageout() is stuck on a single push for this many seconds,
 * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 * to 0, the deadman will have no effect.
 *
 * Note that we are only looking for stalls in the calls that pageout() makes
 * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 * I/O, which should not take long unless the underlying strategy call blocks
 * indefinitely for memory.  The actual I/O request happens (or fails) later.
 */
uint_t pageout_deadman_seconds = 90;

static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;

int async_list_size = 8192;

static void pageout_scanner(void *);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;

/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		if (pageouts_running != 0)
			goto out;

		/* No pageout scanner threads running. */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		/* Note that vavail is signed so don't use clamp() here */
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		if (needfree > 0 && pageout_new_spread == 0) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / SCHEDPAGING_HZ;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / SCHEDPAGING_HZ;
			desscan = (pgcnt_t)result;
		}

		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
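
		/*
		 * For illustration only: at the two endpoints of the linear
		 * interpolation above, when vavail == lotsfree (no memory
		 * pressure) the target works out to desscan = slowscan /
		 * SCHEDPAGING_HZ with a duty cycle of min_pageout_nsec; when
		 * vavail == 0 (severe pressure) it works out to desscan =
		 * fastscan / SCHEDPAGING_HZ with a duty cycle of
		 * max_pageout_nsec.  Intermediate amounts of free memory are
		 * interpolated linearly between those extremes.
		 */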

		DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
		    pageout_nsec);

		if (pageout_new_spread != 0 && despagescanners != 0 &&
		    despagescanners != n_page_scanners) {
			/*
			 * We have finished the pagescan initialisation and the
			 * desired number of page scanners has changed, either
			 * because sampling just finished, because of a memory
			 * DR, or because despagescanners has been modified on
			 * the fly (e.g. via mdb(1)).
			 */
			uint_t curr_nscan = n_page_scanners;
			uint_t i;

			/* Re-validate despagescanners */
			recalc_pagescanners();

			n_page_scanners = despagescanners;

			for (i = 0; i < MAX_PSCAN_THREADS; i++)
				reset_hands[i] = true;

			/* If we need more scanners, start them now. */
			for (i = curr_nscan; i < n_page_scanners; i++) {
				(void) lwp_kernel_create(proc_pageout,
				    pageout_scanner, (void *)(uintptr_t)i,
				    TS_RUN, curthread->t_pri);
			}

			/*
			 * If the number of scanners has decreased, trigger a
			 * wakeup so that the excess threads will terminate.
			 */
			if (n_page_scanners < curr_nscan) {
				WAKE_PAGEOUT_SCANNER(reducing);
			}
		}

		if (pageout_sampling) {
			/*
			 * We still need to measure the rate at which the
			 * system is able to scan pages of memory.  Each of
			 * these initial samples is a scan of as much system
			 * memory as practical, regardless of whether or not we
			 * are experiencing memory pressure.
			 */
			desscan = total_pages;
			pageout_nsec = max_pageout_nsec;

			WAKE_PAGEOUT_SCANNER(sampling);
		} else if (freemem < lotsfree + needfree) {
			/*
			 * We need more memory.
			 */
			low_mem_scan++;
			WAKE_PAGEOUT_SCANNER(lowmem);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner threads.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE)
				po_share >>= 1;
		}
out:
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
	 * in this case it is not needed - the waiters will be woken up during
	 * the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}

pgcnt_t pushes;
ulong_t push_list_size;		/* # of requests on pageout queue */

/*
 * Paging out should always be enabled.  This tunable exists to hold pageout
 * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 * sleep each time it is woken by schedpaging().
 */
uint_t dopageout = 1;

/*
 * The page out daemon, which runs as process 2.
 *
 * The daemon treats physical memory as a circular array of pages and scans
 * the pages using a 'two-handed clock' algorithm.  The front hand moves
 * through the pages, clearing the reference bit.  The back hand travels a
 * distance (handspreadpages) behind the front hand, freeing the pages that
 * have not been referenced in the time since the front hand passed.  If
 * modified, they are first written to their backing store before being
 * freed.
 *
 * In order to make page invalidation more responsive on machines with
 * larger memory, multiple pageout_scanner threads may be created.  In this
 * case, each thread is given a segment of the memory "clock face" so that
 * memory can be reclaimed more quickly.  As long as there are at least
 * lotsfree pages, pageout_scanner threads are not run.
 *
 * There are multiple threads that act on behalf of the pageout process.  A
 * set of threads scan pages (pageout_scanner) and frees them up if they
 * don't require any VOP_PUTPAGE operation.  If a page must be written back
 * to its backing store, the request is put on a list and the other
 * (pageout) thread is signaled.  The pageout thread grabs VOP_PUTPAGE
 * requests from the list, and processes them.  Some filesystems may require
 * resources for the VOP_PUTPAGE operations (like memory) and hence can
 * block the pageout thread, but the scanner thread can still operate.
 * There is still no guarantee that memory deadlocks cannot occur.
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri;

	/* Create the first pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner,
	    (void *)0,	/* this is instance 0, not NULL */
	    TS_RUN, pageout_pri - 1);

	/*
	 * kick off the pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		pageout_pushing = true;
		mutex_exit(&push_lock);

		DTRACE_PROBE(pageout__push);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

static void
pageout_sample_add(pgcnt_t count, hrtime_t elapsed)
{
	VERIFY(pageout_sampling);

	/*
	 * The global variables used below are only modified during initial
	 * scanning when there is a single page scanner thread running.
	 */
	pageout_sample_pages += count;
	pageout_sample_etime += elapsed;
	pageout_sample_cnt++;

	if (pageout_sample_cnt >= pageout_sample_lim) {
		/*
		 * We have enough samples, set the spread.
		 */
		pageout_sampling = false;
		pageout_rate = (hrrate_t)pageout_sample_pages *
		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
		pageout_new_spread = pageout_rate / 10;
	}
}

static inline page_t *
wrapping_page_next(page_t *cur, page_t *start, page_t *end)
{
	if (cur == end)
		return (start);
	return (page_nextn(cur, 1));
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void *a)
{
	page_t *fhand, *bhand, *fhandstart;
	page_t *regionstart, *regionend;
	uint_t laps;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_cnt;
	pgcnt_t	pcount;
	hrtime_t sample_start, sample_end;
	uint_t inst = (uint_t)(uintptr_t)a;

	VERIFY3U(inst, <, MAX_PSCAN_THREADS);

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down, and restarts shouldn't be that often.
	 */
	reset_hands[inst] = true;

	pageouts_running++;
	mutex_exit(&pageout_mutex);

loop:
	cv_signal_pageout();

	mutex_enter(&pageout_mutex);
	pageouts_running--;
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
	pageouts_running++;
	mutex_exit(&pageout_mutex);

	/*
	 * Check if pageout has been disabled for debugging purposes.
	 */
	if (dopageout == 0)
		goto loop;

	/*
	 * One may reset the clock hands and scanned region for debugging
	 * purposes.  Hands will also be reset on first thread startup, if
	 * the number of scanning threads (n_page_scanners) changes, or if
	 * memory is added to, or removed from, the system.
	 */
	if (reset_hands[inst]) {
		page_t *first;

		reset_hands[inst] = false;

		if (inst >= n_page_scanners) {
			/*
			 * The desired number of page scanners has been
			 * reduced and this instance is no longer wanted.
			 * Exit the lwp.
			 */
			VERIFY3U(inst, !=, 0);
			DTRACE_PROBE1(pageout__exit, uint_t, inst);
			mutex_enter(&pageout_mutex);
			pageouts_running--;
			mutex_exit(&pageout_mutex);
			mutex_enter(&curproc->p_lock);
			lwp_exit();
			/* NOTREACHED */
		}

		first = page_first();

		/*
		 * Each scanner thread gets its own sector of the memory
		 * clock face.
		 */
		pgcnt_t span, offset;

		span = looppages / n_page_scanners;
		VERIFY3U(span, >, handspreadpages);

		offset = inst * span;
		regionstart = page_nextn(first, offset);
		if (inst == n_page_scanners - 1) {
			/* The last instance goes up to the last page */
			regionend = page_nextn(first, looppages - 1);
		} else {
			regionend = page_nextn(regionstart, span - 1);
		}

		bhand = regionstart;
		fhand = page_nextn(bhand, handspreadpages);

		DTRACE_PROBE4(pageout__reset, uint_t, inst,
		    pgcnt_t, regionstart, pgcnt_t, regionend,
		    pgcnt_t, fhand);
	}
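
	/*
	 * For illustration only: with looppages of 4,194,304 (16GiB of 4KiB
	 * pages) and n_page_scanners of 4, each instance owns a span of
	 * 1,048,576 pages.  Instance 2, for example, scans the sector
	 * beginning at page offset 2,097,152; its back hand starts at the
	 * beginning of that sector and its front hand starts handspreadpages
	 * further along, with both hands wrapping within the sector.
	 */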

	/*
	 * This CPU kstat is only incremented here and we're on this CPU, so no
	 * lock.
	 */
	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

	/*
	 * Keep track of the number of times we have scanned all the way around
	 * the loop on this wakeup.
	 */
	laps = 0;

	/*
	 * Track the number of pages visited during this scan so that we can
	 * periodically measure our duty cycle.
	 */
	nscan_cnt = 0;
	pcount = 0;

	DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
	    hrtime_t, pageout_nsec, page_t *, bhand, page_t *, fhand);

	/*
	 * Record the initial position of the front hand for this cycle so
	 * that we can detect when the hand wraps around.
	 */
	fhandstart = fhand;

	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 */
	while (nscan_cnt < desscan) {
		checkpage_result_t rvfront, rvback;

		if (!pageout_sampling && freemem >= lotsfree + needfree) {
			/*
			 * We are not sampling and enough memory has become
			 * available that scanning is no longer required.
			 */
			DTRACE_PROBE1(pageout__memfree, uint_t, inst);
			break;
		}

		DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);

		/*
		 * Periodically check to see if we have exceeded the CPU duty
		 * cycle for a single wakeup.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			hrtime_t pageout_cycle_nsec;

			pageout_cycle_nsec = gethrtime() - sample_start;
			if (pageout_cycle_nsec >= pageout_nsec) {
				atomic_inc_64(&pageout_timeouts);
				DTRACE_PROBE1(pageout__timeout, uint_t, inst);
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fhand, POH_FRONT)) == CKP_FREED) {
			laps = 0;
		}
		if ((rvback = checkpage(bhand, POH_BACK)) == CKP_FREED) {
			laps = 0;
		}

		++pcount;

		/*
		 * This CPU kstat is only incremented here and we're on this
		 * CPU, so no lock.
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE)
			nscan_cnt++;

		/*
		 * Tick
		 */
		bhand = wrapping_page_next(bhand, regionstart, regionend);
		fhand = wrapping_page_next(fhand, regionstart, regionend);

		/*
		 * The front hand has wrapped around during this wakeup.
		 */
		if (fhand == fhandstart) {
			laps++;
			DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
			    uint_t, laps);

			/*
			 * This CPU kstat is only incremented here and we're
			 * on this CPU, so no lock.
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);

			if (laps > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					break;
				}
			}
		}
	}

	sample_end = gethrtime();
	atomic_add_long(&nscan, nscan_cnt);

	DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
	    pgcnt_t, nscan_cnt, pgcnt_t, pcount);

	/*
	 * Continue accumulating samples until we have enough to get a
	 * reasonable value for average scan rate.
	 */
	if (pageout_sampling) {
		VERIFY3U(inst, ==, 0);
		pageout_sample_add(pcount, sample_end - sample_start);
		/*
		 * If, after the sample just added, we have finished sampling,
		 * set up the paging constants.
		 */
		if (!pageout_sampling)
			setupclock();
	}

	goto loop;
}

/*
 * The pageout deadman is run once per second by clock().
 */
void
pageout_deadman(void)
{
	if (panicstr != NULL) {
		/*
		 * There is no pageout after panic.
		 */
		return;
	}

	if (pageout_deadman_seconds == 0) {
		/*
		 * The deadman is not enabled.
		 */
		return;
	}

	if (!pageout_pushing) {
		goto reset;
	}

	/*
	 * We are pushing a page.  Check to see if it is the same call we saw
	 * last time we looked:
	 */
	if (pageout_pushcount != pageout_pushcount_seen) {
		/*
		 * It is a different call from the last check, so we are not
		 * stuck.
		 */
		goto reset;
	}

	if (++pageout_stucktime >= pageout_deadman_seconds) {
		panic("pageout_deadman: stuck pushing the same page for %d "
		    "seconds (freemem is %lu)", pageout_deadman_seconds,
		    freemem);
	}

	return;

reset:
	/*
	 * Reset our tracking state to reflect that we are not stuck:
	 */
	pageout_stucktime = 0;
	pageout_pushcount_seen = pageout_pushcount;
}

/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * system (u., page table) or free, then leave it alone.  Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it.  If running the back hand,
 * check whether the page has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *	CKP_INELIGIBLE if the page is not a candidate at all,
 *	CKP_NOT_FREED  if the page was not freed, or
 *	CKP_FREED      if we freed it.
 */
static checkpage_result_t
checkpage(page_t *pp, pageout_hand_t whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode since
	 *	    they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share'd times
	 *	- that are already locked
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */

	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (CKP_INELIGIBLE);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (CKP_INELIGIBLE);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Reject pages that cannot be freed.  The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */
	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == POH_FRONT) {
		pagesync_flag = HAT_SYNC_ZERORM;
	} else {
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;
	}

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If the page is referenced, make it unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		DTRACE_PROBE2(pageout__isref, page_t *, pp,
		    pageout_hand_t, whichhand);

		if (whichhand == POH_FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}

		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (CKP_NOT_FREED);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it.  If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (CKP_INELIGIBLE);
		}

		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);

		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange to have it
	 * cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue I/O request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (CKP_NOT_FREED);
		}
		return (CKP_FREED);
	}

	/*
	 * Now we unload all the translations and put the page back on to the
	 * free list.  If the page was used (referenced or modified) after the
	 * pagesync but before it was unloaded we catch it and handle the page
	 * properly.
	 */
	DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
		goto recheck;
	}

	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (CKP_FREED);
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;	/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left.  The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wake up pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}