1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2018 Joyent, Inc. 24 * Copyright 2023 Oxide Computer Company 25 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 26 */ 27 28 /* 29 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 30 * Use is subject to license terms. 31 */ 32 33 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 34 /* All Rights Reserved */ 35 36 /* 37 * University Copyright- Copyright (c) 1982, 1986, 1988 38 * The Regents of the University of California 39 * All Rights Reserved 40 * 41 * University Acknowledgment- Portions of this document are derived from 42 * software developed by the University of California, Berkeley, and its 43 * contributors. 
44 */ 45 46 #include <sys/types.h> 47 #include <sys/t_lock.h> 48 #include <sys/param.h> 49 #include <sys/buf.h> 50 #include <sys/uio.h> 51 #include <sys/proc.h> 52 #include <sys/systm.h> 53 #include <sys/mman.h> 54 #include <sys/cred.h> 55 #include <sys/vnode.h> 56 #include <sys/vm.h> 57 #include <sys/vmparam.h> 58 #include <sys/vtrace.h> 59 #include <sys/cmn_err.h> 60 #include <sys/cpuvar.h> 61 #include <sys/user.h> 62 #include <sys/kmem.h> 63 #include <sys/debug.h> 64 #include <sys/callb.h> 65 #include <sys/mem_cage.h> 66 #include <sys/time.h> 67 #include <sys/stdbool.h> 68 69 #include <vm/hat.h> 70 #include <vm/as.h> 71 #include <vm/seg.h> 72 #include <vm/page.h> 73 #include <vm/pvn.h> 74 #include <vm/seg_kmem.h> 75 76 /* 77 * FREE MEMORY MANAGEMENT 78 * 79 * Management of the pool of free pages is a tricky business. There are 80 * several critical threshold values which constrain our allocation of new 81 * pages and inform the rate of paging out of memory to swap. These threshold 82 * values, and the behaviour they induce, are described below in descending 83 * order of size -- and thus increasing order of severity! 84 * 85 * +---------------------------------------------------- physmem (all memory) 86 * | 87 * | Ordinarily there are no particular constraints placed on page 88 * v allocation. The page scanner is not running and page_create_va() 89 * | will effectively grant all page requests (whether from the kernel 90 * | or from user processes) without artificial delay. 91 * | 92 * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB) 93 * | 94 * | When we have less than "lotsfree" pages, pageout_scanner() is 95 * v signalled by schedpaging() to begin looking for pages that can 96 * | be evicted to disk to bring us back above lotsfree. At this 97 * | stage there is still no constraint on allocation of free pages. 
98 * | 99 * | For small systems, we set a lower bound of 16MB for lotsfree; 100 * v this is the natural value for a system with 1GB memory. This is 101 * | to ensure that the pageout reserve pool contains at least 4MB 102 * | for use by ZFS. 103 * | 104 * | For systems with a large amount of memory, we constrain lotsfree 105 * | to be at most 2GB (with a pageout reserve of around 0.5GB), as 106 * v at some point the required slack relates more closely to the 107 * | rate at which paging can occur than to the total amount of memory. 108 * | 109 * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB) 110 * | 111 * | When we drop below desfree, a number of kernel facilities will 112 * v wait before allocating more memory, under the assumption that 113 * | pageout or reaping will make progress and free up some memory. 114 * | This behaviour is not especially coordinated; look for comparisons 115 * | of desfree and freemem. 116 * | 117 * | In addition to various attempts at advisory caution, clock() 118 * | will wake up the thread that is ordinarily parked in sched(). 119 * | This routine is responsible for the heavy-handed swapping out 120 * v of entire processes in an attempt to arrest the slide of free 121 * | memory. See comments in sched.c for more details. 122 * | 123 * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB) 124 * | 125 * | These two separate tunables have, by default, the same value. 126 * v Various parts of the kernel use minfree to signal the need for 127 * | more aggressive reclamation of memory, and sched() is more 128 * | aggressive at swapping processes out. 129 * | 130 * | If free memory falls below throttlefree, page_create_va() will 131 * | use page_create_throttle() to begin holding most requests for 132 * | new pages while pageout and reaping free up memory. Sleeping 133 * v allocations (e.g., KM_SLEEP) are held here while we wait for 134 * | more memory. 
Non-sleeping allocations are generally allowed to 135 * | proceed, unless their priority is explicitly lowered with 136 * | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).). 137 * | 138 * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB) 139 * | 140 * | When we hit throttlefree, the situation is already dire. The 141 * v system is generally paging out memory and swapping out entire 142 * | processes in order to free up memory for continued operation. 143 * | 144 * | Unfortunately, evicting memory to disk generally requires short 145 * | term use of additional memory; e.g., allocation of buffers for 146 * | storage drivers, updating maps of free and used blocks, etc. 147 * | As such, pageout_reserve is the number of pages that we keep in 148 * | special reserve for use by pageout() and sched() and by any 149 * v other parts of the kernel that need to be working for those to 150 * | make forward progress such as the ZFS I/O pipeline. 151 * | 152 * | When we are below pageout_reserve, we fail or hold any allocation 153 * | that has not explicitly requested access to the reserve pool. 154 * | Access to the reserve is generally granted via the KM_PUSHPAGE 155 * | flag, or by marking a thread T_PUSHPAGE such that all allocations 156 * | can implicitly tap the reserve. For more details, see the 157 * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE 158 * | and VM_PUSHPAGE allocation flags, and page_create_throttle(). 159 * | 160 * +---------------------------------------------------------- no free memory 161 * | 162 * | If we have arrived here, things are very bad indeed. It is 163 * v surprisingly difficult to tell if this condition is even fatal, 164 * | as enough memory may have been granted to pageout() and to the 165 * | ZFS I/O pipeline that requests for eviction that have already been 166 * | made will complete and free up memory some time soon. 
 * |
 * | If free memory does not materialise, the system generally remains
 * | deadlocked.  The pageout_deadman() below is run once per second
 * | from clock(), seeking to limit the amount of time a single request
 * v to page out can be blocked before the system panics to get a crash
 * | dump and return to service.
 * |
 * +-------------------------------------------------------------------------
 */

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock().  If they are patched non-zero
 * in a loaded vmunix they are left alone and may thus be changed per system
 * using "mdb -kw" on the loaded system.
 */
pgcnt_t		slowscan = 0;
pgcnt_t		fastscan = 0;

/* Distance (in pages) between the front and back clock hands; see below. */
static pgcnt_t	handspreadpages = 0;

/*
 * looppages:
 *	Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *	Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t	loopfraction = 2;
static pgcnt_t	looppages;

/* Scanner duty-cycle bounds (percent of CPU); converted to nsec below. */
static uint_t	min_percent_cpu = 4;
static uint_t	max_percent_cpu = 80;
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;

#define	MEGABYTES	(1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *	set to 1 to use the previous default threshold size calculation;
 *	i.e., each threshold is half of the next largest value.
 */
uint_t pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 * minimum and maximum are specified in bytes, rather than pages; a zero value
 * means the default values (below) are used.
 */
uint_t		lotsfree_fraction = 64;
pgcnt_t		lotsfree_min = 0;
pgcnt_t		lotsfree_max = 0;

#define	LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
#define	LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)

/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is not larger than the threshold above, the specified value will
 * be used directly without any additional calculation or adjustment.  The boot
 * time value of these overrides is preserved in the "clockinit" struct.  More
 * detail is available in the comment at the top of the file.
 */
pgcnt_t		maxpgio = 0;
pgcnt_t		minfree = 0;
pgcnt_t		desfree = 0;
pgcnt_t		lotsfree = 0;
pgcnt_t		needfree = 0;
pgcnt_t		throttlefree = 0;
pgcnt_t		pageout_reserve = 0;

/*
 * deficit:
 *	Additional memory shortfall; subtracted from freemem by schedpaging()
 *	when computing the amount of memory considered available.
 *
 * nscan:
 *	Pages examined by the scanner in its current pass; reset to zero by
 *	schedpaging() each time it runs.
 *
 * desscan:
 *	Pages the scanner should examine in its next pass; computed by
 *	schedpaging() based on current memory pressure.
 */
pgcnt_t		deficit;
pgcnt_t		nscan;
pgcnt_t		desscan;

/* The maximum supported number of page_scanner() threads */
#define	MAX_PSCAN_THREADS	16

/*
 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 * number of nanoseconds in each wakeup cycle that gives the equivalent of some
 * underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *	nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *	nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *	Number of nanoseconds budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_nsec and max_pageout_nsec,
 *	depending on memory pressure.
 */
static hrtime_t	min_pageout_nsec;
static hrtime_t	max_pageout_nsec;
static hrtime_t	pageout_nsec;

/*
 * Per-scanner flags, set by setupclock()/schedpaging() to tell each scanner
 * thread to re-evaluate its clock hand positions.
 */
static bool	reset_hands[MAX_PSCAN_THREADS];

/*
 * NOTE(review): the consumer of PAGES_POLL_MASK is not in this chunk;
 * presumably the scanner uses it to poll periodically (every 1024 pages) —
 * confirm against pageout_scanner().
 */
#define	PAGES_POLL_MASK	1023

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan SCHEDPAGING_HZ
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging() resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging() sets this value based on the amount of
 * currently available memory.
 */
#define	SCHEDPAGING_HZ	4

/* Fire an SDT probe tagged with the wakeup reason, then wake the scanners. */
#define	WAKE_PAGEOUT_SCANNER(tag)					\
	do {								\
		DTRACE_PROBE(schedpage__wake__ ## tag);			\
		cv_broadcast(&proc_pageout->p_cv);			\
	} while (0)

/*
 * despagescanners:
 *	The desired number of page scanner threads.  For testing purposes, this
 *	value can be set in /etc/system or tuned directly with mdb(1).  The
 *	system will bring the actual number of threads into line with the
 *	desired number.  If set to an invalid value, the system will correct
 *	the setting.
 */
uint_t despagescanners = 0;

/*
 * pageout_sample_lim:
 *	The limit on the number of samples needed to establish a value for new
 *	pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *	handspreadpages.
 *
 * pageout_sample_cnt:
 *	Current sample number.  Once the sample gets large enough, set new
 *	values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *	The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *	The accumulated nanoseconds for the sample.
 *
 * pageout_sampling:
 *	True while sampling is still in progress.
 *
 * pageout_rate:
 *	Rate in pages/nanosecond, computed at the end of sampling.
 *
 * pageout_new_spread:
 *	Initially zero while the system scan rate is measured by
 *	pageout_scanner(), which then sets this value once per system boot
 *	after enough samples have been recorded (pageout_sample_cnt).  Once
 *	set, this new value is used for fastscan and handspreadpages.
 */
typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrtime_t	pageout_sample_etime = 0;
static bool	pageout_sampling = true;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

/* The current number of page scanner threads */
static uint_t n_page_scanners = 1;
/* The number of page scanner threads that are actively scanning. */
static uint_t pageouts_running;

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif	/* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t	memavail_lock;
kcondvar_t	memavail_cv;

/* Which clock hand a page is being examined by; see checkpage(). */
typedef enum pageout_hand {
	POH_FRONT = 1,
	POH_BACK,
} pageout_hand_t;

/* Result of examining a single page with checkpage(). */
typedef enum {
	CKP_INELIGIBLE,
	CKP_NOT_FREED,
	CKP_FREED,
} checkpage_result_t;

static checkpage_result_t checkpage(page_t *, pageout_hand_t);

/*
 * Boot-time snapshot of the operator-settable tunables, captured on the first
 * call to setupclock().  A zero in any field means "no override; use the
 * default sizing".  Preserving these lets later recalculations distinguish an
 * operator override from a previously computed value.
 */
static struct clockinit {
	bool ci_init;
	pgcnt_t ci_lotsfree_min;
	pgcnt_t ci_lotsfree_max;
	pgcnt_t ci_lotsfree;
	pgcnt_t ci_desfree;
	pgcnt_t ci_minfree;
	pgcnt_t ci_throttlefree;
	pgcnt_t ci_pageout_reserve;
	pgcnt_t ci_maxpgio;
	pgcnt_t ci_maxfastscan;
	pgcnt_t ci_fastscan;
	pgcnt_t ci_slowscan;
	pgcnt_t ci_handspreadpages;
	uint_t ci_despagescanners;
} clockinit = { .ci_init = false };

/* Constrain value to the inclusive range [minimum, maximum]. */
static inline pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
	if (value < minimum)
		return (minimum);
	else if (value > maximum)
		return (maximum);
	else
		return (value);
}

/*
 * Select a tunable's value: use the operator-provided initval if it is valid
 * (non-zero and below initval_ceiling); otherwise fall back to the computed
 * default defval.
 */
static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
	if (initval == 0 || initval >= initval_ceiling)
		return (defval);
	else
		return (initval);
}

/*
 * On large memory systems, multiple instances of the page scanner are run,
 * each responsible for a separate region of memory.  This speeds up page
 * invalidation under low memory conditions.
 *
 * For testing purposes, despagescanners can be set in /etc/system or via
 * mdb(1) and it will be used as a guide for how many page scanners to create;
 * the value will be adjusted if it is not sensible.  Otherwise, the number of
 * page scanners is determined dynamically based on handspreadpages.
 */
static void
recalc_pagescanners(void)
{
	uint_t des;

	/* If the initial calibration has not been done, take no action. */
	if (pageout_new_spread == 0)
		return;

	/*
	 * If `clockinit.ci_despagescanners` is non-zero, then a value for
	 * `despagescanners` was set during initial boot.  In this case, if
	 * `despagescanners` has been reset to 0 then we want to revert to
	 * that initial boot value.
	 */
	if (despagescanners == 0)
		despagescanners = clockinit.ci_despagescanners;

	if (despagescanners != 0) {
		/*
		 * We have a desired number of page scanners, either from
		 * /etc/system or set via mdb.  Try and use it (it will be
		 * adjusted below if necessary).
		 */
		des = despagescanners;
	} else {
		/*
		 * Calculate the number of desired scanners based on the
		 * system's memory size.
		 *
		 * A 64GiB region size is used as the basis for calculating how
		 * many scanner threads should be created.  For systems with up
		 * to 64GiB of RAM, a single thread is used; for very large
		 * memory systems the threads are limited to MAX_PSCAN_THREADS.
		 */
		des = (looppages - 1) / btop(64ULL << 30) + 1;
	}

	/*
	 * Clamp the number of scanners so that we have no more than
	 * MAX_PSCAN_THREADS and so that each scanner covers at least 10% more
	 * than handspreadpages.
	 */
	pgcnt_t min_scanner_pages = handspreadpages + handspreadpages / 10;
	pgcnt_t max_scanners = looppages / min_scanner_pages;
	despagescanners = clamp(des, 1,
	    clamp(max_scanners, 1, MAX_PSCAN_THREADS));
}

/*
 * Set up the paging constants for the clock algorithm used by
 * pageout_scanner(), and by the virtual memory system overall.  See the
 * comments at the top of this file for more information about the threshold
 * values and system responses to memory pressure.
 *
 * This routine is called once by main() at startup, after the initial size of
 * physical memory is determined.  It may be called again later if memory is
 * added to or removed from the system, or if new measurements of the page scan
 * rate become available.
 */
void
setupclock(void)
{
	bool half = (pageout_threshold_style == 1);
	bool recalc = true;

	looppages = total_pages;

	/*
	 * The operator may have provided specific values for some of the
	 * tunables via /etc/system.  On our first call, we preserve those
	 * values so that they can be used for subsequent recalculations.
	 *
	 * A value of zero for any tunable means we will use the default
	 * sizing.
	 */
	if (!clockinit.ci_init) {
		clockinit.ci_init = true;

		clockinit.ci_lotsfree_min = lotsfree_min;
		clockinit.ci_lotsfree_max = lotsfree_max;
		clockinit.ci_lotsfree = lotsfree;
		clockinit.ci_desfree = desfree;
		clockinit.ci_minfree = minfree;
		clockinit.ci_throttlefree = throttlefree;
		clockinit.ci_pageout_reserve = pageout_reserve;
		clockinit.ci_maxpgio = maxpgio;
		clockinit.ci_maxfastscan = maxfastscan;
		clockinit.ci_fastscan = fastscan;
		clockinit.ci_slowscan = slowscan;
		clockinit.ci_handspreadpages = handspreadpages;
		clockinit.ci_despagescanners = despagescanners;

		/*
		 * The first call does not trigger a recalculation, only
		 * subsequent calls.
		 */
		recalc = false;
	}

	/*
	 * Configure paging threshold values.  For more details on what each
	 * threshold signifies, see the comments at the top of this file.
	 *
	 * Note that the thresholds are computed in descending order, each one
	 * bounded by (and defaulting to a fraction of) the one before it.
	 */
	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
	    btop(LOTSFREE_MAX_DEFAULT));
	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
	    btop(LOTSFREE_MIN_DEFAULT));

	lotsfree = tune(clockinit.ci_lotsfree, looppages,
	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

	desfree = tune(clockinit.ci_desfree, lotsfree,
	    lotsfree / 2);

	minfree = tune(clockinit.ci_minfree, desfree,
	    half ? desfree / 2 : 3 * desfree / 4);

	throttlefree = tune(clockinit.ci_throttlefree, desfree,
	    minfree);

	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
	    half ? throttlefree / 2 : 3 * throttlefree / 4);

	/*
	 * Maxpgio thresholds how much paging is acceptable.
	 * This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (clockinit.ci_maxpgio == 0) {
		maxpgio = (DISKRPM * 2) / 3;
	} else {
		maxpgio = clockinit.ci_maxpgio;
	}

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  Fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, Mhz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPU's and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPU's.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 Mhz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 Mhz SC2000:		68 Meg
	 *
	 *	40 Mhz 486:		26 Meg
	 *	66 Mhz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 Mhz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPU's can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (clockinit.ci_maxfastscan == 0) {
		if (pageout_new_spread != 0) {
			maxfastscan = pageout_new_spread;
		} else {
			maxfastscan = MAXHANDSPREADPAGES;
		}
	} else {
		maxfastscan = clockinit.ci_maxfastscan;
	}

	if (clockinit.ci_fastscan == 0) {
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	} else {
		fastscan = clockinit.ci_fastscan;
	}

	/* Even an operator-specified fastscan may not exceed 1/2 of memory. */
	if (fastscan > looppages / loopfraction) {
		fastscan = looppages / loopfraction;
	}

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (clockinit.ci_slowscan == 0) {
		slowscan = MIN(fastscan / 10, maxslowscan);
	} else {
		slowscan = clockinit.ci_slowscan;
	}

	if (slowscan > fastscan / 2) {
		slowscan = fastscan / 2;
	}

	/*
	 * Handspreadpages is the distance (in pages) between front and back
	 * pageout daemon hands.  The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises.  It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (clockinit.ci_handspreadpages == 0) {
		handspreadpages = fastscan;
	} else {
		handspreadpages = clockinit.ci_handspreadpages;
	}

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
	 * back hand to look at a page during the same wakeup of the pageout
	 * daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages) {
		handspreadpages = looppages - 1;
	}

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle.  The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second.
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);

	/*
	 * If not called for recalculation, return and skip the remaining
	 * steps.
	 */
	if (!recalc)
		return;

	/*
	 * Set a flag to re-evaluate the clock hand positions.
	 */
	for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
		reset_hands[i] = true;

	recalc_pagescanners();
}

static kmutex_t	pageout_mutex;

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

/*
 * If pageout() is stuck on a single push for this many seconds,
 * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 * to 0, the deadman will have no effect.
 *
 * Note that we are only looking for stalls in the calls that pageout() makes
 * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 * I/O, which should not take long unless the underlying strategy call blocks
 * indefinitely for memory.  The actual I/O request happens (or fails) later.
 */
uint_t pageout_deadman_seconds = 90;

/* Deadman bookkeeping: how long the current push has been stuck, etc. */
static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;

int async_list_size = 8192;

static void pageout_scanner(void *);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;

/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 *
 * Runs SCHEDPAGING_HZ times per second; it reschedules itself via timeout()
 * at the bottom of the function.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	/* Ask the kernel memory allocator to release cached memory. */
	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	/*
	 * If the mutex is already held (e.g. by a concurrent recalculation),
	 * skip this cycle entirely; we will be called again in 1/SCHEDPAGING_HZ
	 * seconds.
	 */
	if (mutex_tryenter(&pageout_mutex)) {
		if (pageouts_running != 0)
			goto out;

		/* No pageout scanner threads running. */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		/* Note that vavail is signed so don't use clamp() here */
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		if (needfree > 0 && pageout_new_spread == 0) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / SCHEDPAGING_HZ;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / SCHEDPAGING_HZ;
			desscan = (pgcnt_t)result;
		}

		/*
		 * Linearly interpolate the per-wakeup CPU budget between the
		 * minimum and maximum as free memory falls toward zero.
		 */
		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

		DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
		    pageout_nsec);

		if (pageout_new_spread != 0 && despagescanners != 0 &&
		    despagescanners != n_page_scanners) {
			/*
			 * We have finished the pagescan initialisation and the
			 * desired number of page scanners has changed, either
			 * because sampling just finished, because of a memory
			 * DR, or because despagescanners has been modified on
			 * the fly (e.g. via mdb(1)).
			 */
			uint_t curr_nscan = n_page_scanners;
			uint_t i;

			/* Re-validate despagescanners */
			recalc_pagescanners();

			n_page_scanners = despagescanners;

			for (i = 0; i < MAX_PSCAN_THREADS; i++)
				reset_hands[i] = true;

			/* If we need more scanners, start them now. */
			for (i = curr_nscan; i < n_page_scanners; i++) {
				(void) lwp_kernel_create(proc_pageout,
				    pageout_scanner, (void *)(uintptr_t)i,
				    TS_RUN, curthread->t_pri);
			}

			/*
			 * If the number of scanners has decreased, trigger a
			 * wakeup so that the excess threads will terminate.
			 */
			if (n_page_scanners < curr_nscan) {
				WAKE_PAGEOUT_SCANNER(reducing);
			}
		}

		if (pageout_sampling) {
			/*
			 * We still need to measure the rate at which the
			 * system is able to scan pages of memory.  Each of
			 * these initial samples is a scan of as much system
			 * memory as practical, regardless of whether or not we
			 * are experiencing memory pressure.
			 */
			desscan = total_pages;
			pageout_nsec = max_pageout_nsec;

			WAKE_PAGEOUT_SCANNER(sampling);
		} else if (freemem < lotsfree + needfree) {
			/*
			 * We need more memory.
			 */
			WAKE_PAGEOUT_SCANNER(lowmem);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner threads.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE)
				po_share >>= 1;
		}
out:
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
	 * in this case it is not needed - the waiters will be woken up during
	 * the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}

/*
 * NOTE(review): pushes is updated outside this chunk — presumably a running
 * count of putpage pushes issued by pageout(); confirm against pageout().
 */
pgcnt_t		pushes;
ulong_t		push_list_size;		/* # of requests on pageout queue */

/*
 * Paging out should always be enabled.  This tunable exists to hold pageout
 * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 * sleep each time it is woken by schedpaging().
 */
uint_t dopageout = 1;

/*
 * The page out daemon, which runs as process 2.
 *
 * The daemon treats physical memory as a circular array of pages and scans
 * the pages using a 'two-handed clock' algorithm.  The front hand moves
 * through the pages, clearing the reference bit.  The back hand travels a
 * distance (handspreadpages) behind the front hand, freeing the pages that
 * have not been referenced in the time since the front hand passed.  If
 * modified, they are first written to their backing store before being
 * freed.
 *
 * In order to make page invalidation more responsive on machines with
 * larger memory, multiple pageout_scanner threads may be created. In this
 * case, each thread is given a segment of the memory "clock face" so that
 * memory can be reclaimed more quickly. As long as there are at least lotsfree
 * pages, then pageout_scanner threads are not run.
 *
 * There are multiple threads that act on behalf of the pageout process. A
 * set of threads scan pages (pageout_scanner) and frees them up if they
 * don't require any VOP_PUTPAGE operation. If a page must be written back
 * to its backing store, the request is put on a list and the other
 * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
 * requests from the list, and processes them. Some filesystems may require
 * resources for the VOP_PUTPAGE operations (like memory) and hence can
 * block the pageout thread, but the scanner thread can still operate.
 * There is still no guarantee that memory deadlocks cannot occur.
 *
 * This function never returns: after initialization it loops forever
 * servicing queued push requests.
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures for pageout.
	 * The last element's a_next is left NULL by kmem_zalloc(), which
	 * terminates the freelist.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri;

	/* Create the first pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner,
	    (void *)0,	/* this is instance 0, not NULL */
	    TS_RUN, pageout_pri - 1);

	/*
	 * kick off the pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	/*
	 * Main pusher loop: dequeue one async request at a time, issue the
	 * VOP_PUTPAGE(), then return the request struct to the freelist.
	 */
	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		pageout_pushing = true;
		mutex_exit(&push_lock);

		DTRACE_PROBE(pageout__push);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

/*
 * Fold one scan pass (pages visited, elapsed time) into the sampling
 * accumulators; once enough samples have been taken, compute the scan rate
 * and spread and end the sampling phase.  Only ever called by scanner
 * instance 0 (enforced by the VERIFY3U at the call site), so the globals
 * below need no locking.
 */
static void
pageout_sample_add(pgcnt_t count, hrtime_t elapsed)
{
	VERIFY(pageout_sampling);

	/*
	 * The global variables used below are only modified during initial
	 * scanning when there is a single page scanner thread running.
	 */
	pageout_sample_pages += count;
	pageout_sample_etime += elapsed;
	pageout_sample_cnt++;

	if (pageout_sample_cnt >= pageout_sample_lim) {
		/*
		 * We have enough samples, set the spread.
		 */
		pageout_sampling = false;
		pageout_rate = (hrrate_t)pageout_sample_pages *
		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
		pageout_new_spread = pageout_rate / 10;
	}
}

/*
 * Advance one page within a scanner's region of the clock face, wrapping
 * from the region's last page back to its first.
 */
static inline page_t *
wrapping_page_next(page_t *cur, page_t *start, page_t *end)
{
	if (cur == end)
		return (start);
	return (page_nextn(cur, 1));
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void *a)
{
	page_t *fhand, *bhand, *fhandstart;
	page_t *regionstart, *regionend;
	uint_t laps;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_cnt;
	pgcnt_t	pcount;
	hrtime_t sample_start, sample_end;
	uint_t inst = (uint_t)(uintptr_t)a;

	VERIFY3U(inst, <, MAX_PSCAN_THREADS);

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down, and restarts shouldn't be that often.
	 *
	 * Setting reset_hands here also guarantees that bhand/fhand are
	 * initialized before the first scan pass below.
	 */
	reset_hands[inst] = true;

	pageouts_running++;
	mutex_exit(&pageout_mutex);

loop:
	cv_signal_pageout();

	/*
	 * Sleep until woken by schedpaging(); pageouts_running is dropped
	 * while we sleep so schedpaging() can recompute scan parameters.
	 */
	mutex_enter(&pageout_mutex);
	pageouts_running--;
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
	pageouts_running++;
	mutex_exit(&pageout_mutex);

	/*
	 * Check if pageout has been disabled for debugging purposes.
	 */
	if (dopageout == 0)
		goto loop;

	/*
	 * One may reset the clock hands and scanned region for debugging
	 * purposes.  Hands will also be reset on first thread startup, if
	 * the number of scanning threads (n_page_scanners) changes, or if
	 * memory is added to, or removed from, the system.
	 */
	if (reset_hands[inst]) {
		page_t *first;

		reset_hands[inst] = false;

		if (inst >= n_page_scanners) {
			/*
			 * The desired number of page scanners has been
			 * reduced and this instance is no longer wanted.
			 * Exit the lwp.
			 */
			VERIFY3U(inst, !=, 0);
			DTRACE_PROBE1(pageout__exit, uint_t, inst);
			mutex_enter(&pageout_mutex);
			pageouts_running--;
			mutex_exit(&pageout_mutex);
			mutex_enter(&curproc->p_lock);
			lwp_exit();
			/* NOTREACHED */
		}

		first = page_first();

		/*
		 * Each scanner thread gets its own sector of the memory
		 * clock face.
		 */
		pgcnt_t span, offset;

		span = looppages / n_page_scanners;
		VERIFY3U(span, >, handspreadpages);

		offset = inst * span;
		regionstart = page_nextn(first, offset);
		if (inst == n_page_scanners - 1) {
			/* The last instance goes up to the last page */
			regionend = page_nextn(first, looppages - 1);
		} else {
			regionend = page_nextn(regionstart, span - 1);
		}

		bhand = regionstart;
		fhand = page_nextn(bhand, handspreadpages);

		DTRACE_PROBE4(pageout__reset, uint_t, inst,
		    pgcnt_t, regionstart, pgcnt_t, regionend,
		    pgcnt_t, fhand);
	}

	/*
	 * This CPU kstat is only incremented here and we're on this CPU, so no
	 * lock.
	 */
	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

	/*
	 * Keep track of the number of times we have scanned all the way around
	 * the loop on this wakeup.
	 */
	laps = 0;

	/*
	 * Track the number of pages visited during this scan so that we can
	 * periodically measure our duty cycle.
	 */
	nscan_cnt = 0;
	pcount = 0;

	DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
	    hrtime_t, pageout_nsec, page_t *, bhand, page_t *, fhand);

	/*
	 * Record the initial position of the front hand for this cycle so
	 * that we can detect when the hand wraps around.
	 */
	fhandstart = fhand;

	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 */
	while (nscan_cnt < desscan) {
		checkpage_result_t rvfront, rvback;

		if (!pageout_sampling && freemem >= lotsfree + needfree) {
			/*
			 * We are not sampling and enough memory has become
			 * available that scanning is no longer required.
			 */
			DTRACE_PROBE1(pageout__memfree, uint_t, inst);
			break;
		}

		DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);

		/*
		 * Periodically check to see if we have exceeded the CPU duty
		 * cycle for a single wakeup.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			hrtime_t pageout_cycle_nsec;

			pageout_cycle_nsec = gethrtime() - sample_start;
			if (pageout_cycle_nsec >= pageout_nsec) {
				atomic_inc_64(&pageout_timeouts);
				DTRACE_PROBE1(pageout__timeout, uint_t, inst);
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fhand, POH_FRONT)) == CKP_FREED) {
			laps = 0;
		}
		if ((rvback = checkpage(bhand, POH_BACK)) == CKP_FREED) {
			laps = 0;
		}

		++pcount;

		/*
		 * This CPU kstat is only incremented here and we're on this
		 * CPU, so no lock.
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE)
			nscan_cnt++;

		/*
		 * Tick
		 */
		bhand = wrapping_page_next(bhand, regionstart, regionend);
		fhand = wrapping_page_next(fhand, regionstart, regionend);

		/*
		 * The front hand has wrapped around during this wakeup.
		 */
		if (fhand == fhandstart) {
			laps++;
			DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
			    uint_t, laps);

			/*
			 * This CPU kstat is only incremented here and we're
			 * on this CPU, so no lock.
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);

			if (laps > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					break;
				}
			}
		}
	}

	sample_end = gethrtime();
	atomic_add_long(&nscan, nscan_cnt);

	DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
	    pgcnt_t, nscan_cnt, pgcnt_t, pcount)

	/*
	 * Continue accumulating samples until we have enough to get a
	 * reasonable value for average scan rate.
	 */
	if (pageout_sampling) {
		VERIFY3U(inst, ==, 0);
		pageout_sample_add(pcount, sample_end - sample_start);
		/*
		 * If, after the sample just added, we have finished sampling,
		 * set up the paging constants.
		 */
		if (!pageout_sampling)
			setupclock();
	}

	goto loop;
}

/*
 * The pageout deadman is run once per second by clock().
 *
 * If pageout() appears to have been stuck inside the same VOP_PUTPAGE()
 * call (pageout_pushing set with no progress in pageout_pushcount) for
 * pageout_deadman_seconds consecutive checks, panic the system rather than
 * hang silently.
 */
void
pageout_deadman(void)
{
	if (panicstr != NULL) {
		/*
		 * There is no pageout after panic.
		 */
		return;
	}

	if (pageout_deadman_seconds == 0) {
		/*
		 * The deadman is not enabled.
		 */
		return;
	}

	if (!pageout_pushing) {
		goto reset;
	}

	/*
	 * We are pushing a page.  Check to see if it is the same call we saw
	 * last time we looked:
	 */
	if (pageout_pushcount != pageout_pushcount_seen) {
		/*
		 * It is a different call from the last check, so we are not
		 * stuck.
		 */
		goto reset;
	}

	if (++pageout_stucktime >= pageout_deadman_seconds) {
		panic("pageout_deadman: stuck pushing the same page for %d "
		    "seconds (freemem is %lu)", pageout_deadman_seconds,
		    freemem);
	}

	return;

reset:
	/*
	 * Reset our tracking state to reflect that we are not stuck:
	 */
	pageout_stucktime = 0;
	pageout_pushcount_seen = pageout_pushcount;
}

/*
 * Look at the page at hand.
 * If it is locked (e.g., for physical i/o),
 * system (u., page table) or free, then leave it alone.  Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it.  If running the back hand,
 * check whether the page has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *	CKP_INELIGIBLE if the page is not a candidate at all,
 *	CKP_NOT_FREED  if the page was not freed, or
 *	CKP_FREED      if we freed it.  Note that for a dirty page this
 *	               means the write-back was queued for the pageout
 *	               thread, not that the page is free yet.
 */
static checkpage_result_t
checkpage(page_t *pp, pageout_hand_t whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode since
	 *	    they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share'd times
	 *	- its already locked
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */

	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (CKP_INELIGIBLE);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (CKP_INELIGIBLE);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Reject pages that cannot be freed.  The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */
	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == POH_FRONT) {
		pagesync_flag = HAT_SYNC_ZERORM;
	} else {
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;
	}

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If page is referenced; make unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		DTRACE_PROBE2(pageout__isref, page_t *, pp,
		    pageout_hand_t, whichhand);

		if (whichhand == POH_FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}

		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (CKP_NOT_FREED);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it.  If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (CKP_INELIGIBLE);
		}

		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);

		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange to have it
	 * cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.  On success the hold is transferred to
		 * the queued request; pageout() does the VN_RELE.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue I/O request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (CKP_NOT_FREED);
		}
		return (CKP_FREED);
	}

	/*
	 * Now we unload all the translations and put the page back on to the
	 * free list.  If the page was used (referenced or modified) after the
	 * pagesync but before it was unloaded we catch it and handle the page
	 * properly.
	 */
	DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
		goto recheck;
	}

	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (CKP_FREED);
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 *
 * Returns 1 if the request was queued, 0 if no request structures were
 * available (the caller should skip the page).
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;		/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left. The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wake up pageout to initiate i/o if push_list is not empty.
 *
 * The initial push_list check is deliberately done without push_lock as a
 * cheap filter; a stale read only costs a missed or spurious signal --
 * NOTE(review): assumes pointer reads are atomic on all supported
 * platforms.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}