1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2020 Oxide Computer Company 24 */ 25 26 /* 27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 32 /* All Rights Reserved */ 33 34 /* 35 * University Copyright- Copyright (c) 1982, 1986, 1988 36 * The Regents of the University of California 37 * All Rights Reserved 38 * 39 * University Acknowledgment- Portions of this document are derived from 40 * software developed by the University of California, Berkeley, and its 41 * contributors. 
42 */ 43 44 #include <sys/types.h> 45 #include <sys/t_lock.h> 46 #include <sys/param.h> 47 #include <sys/buf.h> 48 #include <sys/uio.h> 49 #include <sys/proc.h> 50 #include <sys/systm.h> 51 #include <sys/mman.h> 52 #include <sys/cred.h> 53 #include <sys/vnode.h> 54 #include <sys/vm.h> 55 #include <sys/vmparam.h> 56 #include <sys/vtrace.h> 57 #include <sys/cmn_err.h> 58 #include <sys/cpuvar.h> 59 #include <sys/user.h> 60 #include <sys/kmem.h> 61 #include <sys/debug.h> 62 #include <sys/callb.h> 63 #include <sys/tnf_probe.h> 64 #include <sys/mem_cage.h> 65 #include <sys/time.h> 66 #include <sys/stdbool.h> 67 68 #include <vm/hat.h> 69 #include <vm/as.h> 70 #include <vm/seg.h> 71 #include <vm/page.h> 72 #include <vm/pvn.h> 73 #include <vm/seg_kmem.h> 74 75 static int checkpage(page_t *, int); 76 77 /* 78 * The following parameters control operation of the page replacement 79 * algorithm. They are initialized to 0, and then computed at boot time 80 * based on the size of the system. If they are patched non-zero in 81 * a loaded vmunix they are left alone and may thus be changed per system 82 * using adb on the loaded system. 83 */ 84 pgcnt_t slowscan = 0; 85 pgcnt_t fastscan = 0; 86 87 static pgcnt_t handspreadpages = 0; 88 static int loopfraction = 2; 89 static pgcnt_t looppages; 90 static int min_percent_cpu = 4; 91 static int max_percent_cpu = 80; 92 static pgcnt_t maxfastscan = 0; 93 static pgcnt_t maxslowscan = 100; 94 95 pgcnt_t maxpgio = 0; 96 pgcnt_t minfree = 0; 97 pgcnt_t desfree = 0; 98 pgcnt_t lotsfree = 0; 99 pgcnt_t needfree = 0; 100 pgcnt_t throttlefree = 0; 101 pgcnt_t pageout_reserve = 0; 102 103 pgcnt_t deficit; 104 pgcnt_t nscan; 105 pgcnt_t desscan; 106 107 /* 108 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks 109 * are the number of ticks in each wakeup cycle that gives the 110 * equivalent of some underlying %CPU duty cycle. 111 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is 112 * awakened every 25 clock ticks. 
So, converting from %CPU to ticks 113 * per wakeup cycle would be x% of 25, that is (x * 25) / 100. 114 * So, for example, 4% == 1 tick and 80% == 20 ticks. 115 * 116 * min_pageout_ticks: 117 * ticks/wakeup equivalent of min_percent_cpu. 118 * 119 * max_pageout_ticks: 120 * ticks/wakeup equivalent of max_percent_cpu. 121 * 122 * pageout_ticks: 123 * Number of clock ticks budgeted for each wakeup cycle. 124 * Computed each time around by schedpaging(). 125 * Varies between min_pageout_ticks .. max_pageout_ticks, 126 * depending on memory pressure. 127 * 128 * pageout_lbolt: 129 * Timestamp of the last time pageout_scanner woke up and started 130 * (or resumed) scanning for not recently referenced pages. 131 */ 132 133 static clock_t min_pageout_ticks; 134 static clock_t max_pageout_ticks; 135 static clock_t pageout_ticks; 136 static clock_t pageout_lbolt; 137 138 static uint_t reset_hands; 139 140 #define PAGES_POLL_MASK 1023 141 142 /* 143 * pageout_sample_lim: 144 * The limit on the number of samples needed to establish a value 145 * for new pageout parameters, fastscan, slowscan, and handspreadpages. 146 * 147 * pageout_sample_cnt: 148 * Current sample number. Once the sample gets large enough, 149 * set new values for handspreadpages, fastscan and slowscan. 150 * 151 * pageout_sample_pages: 152 * The accumulated number of pages scanned during sampling. 153 * 154 * pageout_sample_etime: 155 * The accumulated elapsed time (in hrtime units) for the sample. 156 * 157 * pageout_rate: 158 * Rate in pages/second, computed at the end of sampling. 159 * 160 * pageout_new_spread: 161 * The new value to use for fastscan and handspreadpages. 162 * Calculated after enough samples have been taken. 
163 */ 164 165 typedef hrtime_t hrrate_t; 166 167 static uint64_t pageout_sample_lim = 4; 168 static uint64_t pageout_sample_cnt = 0; 169 static pgcnt_t pageout_sample_pages = 0; 170 static hrrate_t pageout_rate = 0; 171 static pgcnt_t pageout_new_spread = 0; 172 173 static clock_t pageout_cycle_ticks; 174 static hrtime_t sample_start, sample_end; 175 static hrtime_t pageout_sample_etime = 0; 176 177 /* 178 * Record number of times a pageout_scanner wakeup cycle finished because it 179 * timed out (exceeded its CPU budget), rather than because it visited 180 * its budgeted number of pages. 181 */ 182 uint64_t pageout_timeouts = 0; 183 184 #ifdef VM_STATS 185 static struct pageoutvmstats_str { 186 ulong_t checkpage[3]; 187 } pageoutvmstats; 188 #endif /* VM_STATS */ 189 190 /* 191 * Threads waiting for free memory use this condition variable and lock until 192 * memory becomes available. 193 */ 194 kmutex_t memavail_lock; 195 kcondvar_t memavail_cv; 196 197 /* 198 * The size of the clock loop. 199 */ 200 #define LOOPPAGES total_pages 201 202 /* 203 * Set up the paging constants for the clock algorithm. 204 * Called after the system is initialized and the amount of memory 205 * and number of paging devices is known. 206 * 207 * lotsfree is 1/64 of memory, but at least 512K. 208 * desfree is 1/2 of lotsfree. 209 * minfree is 1/2 of desfree. 210 * 211 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set: 212 * 213 * lotsfree = btop(512K) 214 * desfree = btop(200K) 215 * minfree = btop(100K) 216 * throttlefree = INT_MIN 217 * max_percent_cpu = 4 218 */ 219 void 220 setupclock(int recalc) 221 { 222 223 static spgcnt_t init_lfree, init_dfree, init_mfree; 224 static spgcnt_t init_tfree, init_preserve, init_mpgio; 225 static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages; 226 227 looppages = LOOPPAGES; 228 229 /* 230 * setupclock can now be called to recalculate the paging 231 * parameters in the case of dynamic addition of memory. 
232 * So to make sure we make the proper calculations, if such a 233 * situation should arise, we save away the initial values 234 * of each parameter so we can recall them when needed. This 235 * way we don't lose the settings an admin might have made 236 * through the /etc/system file. 237 */ 238 239 if (!recalc) { 240 init_lfree = lotsfree; 241 init_dfree = desfree; 242 init_mfree = minfree; 243 init_tfree = throttlefree; 244 init_preserve = pageout_reserve; 245 init_mpgio = maxpgio; 246 init_mfscan = maxfastscan; 247 init_fscan = fastscan; 248 init_sscan = slowscan; 249 init_hspages = handspreadpages; 250 } 251 252 /* 253 * Set up thresholds for paging: 254 */ 255 256 /* 257 * Lotsfree is threshold where paging daemon turns on. 258 */ 259 if (init_lfree == 0 || init_lfree >= looppages) 260 lotsfree = MAX(looppages / 64, btop(512 * 1024)); 261 else 262 lotsfree = init_lfree; 263 264 /* 265 * Desfree is amount of memory desired free. 266 * If less than this for extended period, start swapping. 267 */ 268 if (init_dfree == 0 || init_dfree >= lotsfree) 269 desfree = lotsfree / 2; 270 else 271 desfree = init_dfree; 272 273 /* 274 * Minfree is minimal amount of free memory which is tolerable. 275 */ 276 if (init_mfree == 0 || init_mfree >= desfree) 277 minfree = desfree / 2; 278 else 279 minfree = init_mfree; 280 281 /* 282 * Throttlefree is the point at which we start throttling 283 * PG_WAIT requests until enough memory becomes available. 284 */ 285 if (init_tfree == 0 || init_tfree >= desfree) 286 throttlefree = minfree; 287 else 288 throttlefree = init_tfree; 289 290 /* 291 * Pageout_reserve is the number of pages that we keep in 292 * stock for pageout's own use. Having a few such pages 293 * provides insurance against system deadlock due to 294 * pageout needing pages. When freemem < pageout_reserve, 295 * non-blocking allocations are denied to any threads 296 * other than pageout and sched. 
(At some point we might 297 * want to consider a per-thread flag like T_PUSHING_PAGES 298 * to indicate that a thread is part of the page-pushing 299 * dance (e.g. an interrupt thread) and thus is entitled 300 * to the same special dispensation we accord pageout.) 301 */ 302 if (init_preserve == 0 || init_preserve >= throttlefree) 303 pageout_reserve = throttlefree / 2; 304 else 305 pageout_reserve = init_preserve; 306 307 /* 308 * Maxpgio thresholds how much paging is acceptable. 309 * This figures that 2/3 busy on an arm is all that is 310 * tolerable for paging. We assume one operation per disk rev. 311 * 312 * XXX - Does not account for multiple swap devices. 313 */ 314 if (init_mpgio == 0) 315 maxpgio = (DISKRPM * 2) / 3; 316 else 317 maxpgio = init_mpgio; 318 319 /* 320 * The clock scan rate varies between fastscan and slowscan 321 * based on the amount of free memory available. Fastscan 322 * rate should be set based on the number pages that can be 323 * scanned per sec using ~10% of processor time. Since this 324 * value depends on the processor, MMU, Mhz etc., it is 325 * difficult to determine it in a generic manner for all 326 * architectures. 327 * 328 * Instead of trying to determine the number of pages scanned 329 * per sec for every processor, fastscan is set to be the smaller 330 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling 331 * time is limited to ~4% of processor time. 332 * 333 * Setting fastscan to be 1/2 of memory allows pageout to scan 334 * all of memory in ~2 secs. This implies that user pages not 335 * accessed within 1 sec (assuming, handspreadpages == fastscan) 336 * can be reclaimed when free memory is very low. Stealing pages 337 * not accessed within 1 sec seems reasonable and ensures that 338 * active user processes don't thrash. 
339 * 340 * Smaller values of fastscan result in scanning fewer pages 341 * every second and consequently pageout may not be able to free 342 * sufficient memory to maintain the minimum threshold. Larger 343 * values of fastscan result in scanning a lot more pages which 344 * could lead to thrashing and higher CPU usage. 345 * 346 * Fastscan needs to be limited to a maximum value and should not 347 * scale with memory to prevent pageout from consuming too much 348 * time for scanning on slow CPU's and avoid thrashing, as a 349 * result of scanning too many pages, on faster CPU's. 350 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES 351 * (the upper bound for fastscan) based on the average number 352 * of pages that can potentially be scanned in ~1 sec (using ~4% 353 * of the CPU) on some of the following machines that currently 354 * run Solaris 2.x: 355 * 356 * average memory scanned in ~1 sec 357 * 358 * 25 Mhz SS1+: 23 Meg 359 * LX: 37 Meg 360 * 50 Mhz SC2000: 68 Meg 361 * 362 * 40 Mhz 486: 26 Meg 363 * 66 Mhz 486: 42 Meg 364 * 365 * When free memory falls just below lotsfree, the scan rate 366 * goes from 0 to slowscan (i.e., pageout starts running). This 367 * transition needs to be smooth and is achieved by ensuring that 368 * pageout scans a small number of pages to satisfy the transient 369 * memory demand. This is set to not exceed 100 pages/sec (25 per 370 * wakeup) since scanning that many pages has no noticible impact 371 * on system performance. 372 * 373 * In addition to setting fastscan and slowscan, pageout is 374 * limited to using ~4% of the CPU. This results in increasing 375 * the time taken to scan all of memory, which in turn means that 376 * user processes have a better opportunity of preventing their 377 * pages from being stolen. This has a positive effect on 378 * interactive and overall system performance when memory demand 379 * is high. 
380 * 381 * Thus, the rate at which pages are scanned for replacement will 382 * vary linearly between slowscan and the number of pages that 383 * can be scanned using ~4% of processor time instead of varying 384 * linearly between slowscan and fastscan. 385 * 386 * Also, the processor time used by pageout will vary from ~1% 387 * at slowscan to ~4% at fastscan instead of varying between 388 * ~1% at slowscan and ~10% at fastscan. 389 * 390 * The values chosen for the various VM parameters (fastscan, 391 * handspreadpages, etc) are not universally true for all machines, 392 * but appear to be a good rule of thumb for the machines we've 393 * tested. They have the following ranges: 394 * 395 * cpu speed: 20 to 70 Mhz 396 * page size: 4K to 8K 397 * memory size: 16M to 5G 398 * page scan rate: 4000 - 17400 4K pages per sec 399 * 400 * The values need to be re-examined for machines which don't 401 * fall into the various ranges (e.g., slower or faster CPUs, 402 * smaller or larger pagesizes etc) shown above. 403 * 404 * On an MP machine, pageout is often unable to maintain the 405 * minimum paging thresholds under heavy load. This is due to 406 * the fact that user processes running on other CPU's can be 407 * dirtying memory at a much faster pace than pageout can find 408 * pages to free. The memory demands could be met by enabling 409 * more than one CPU to run the clock algorithm in such a manner 410 * that the various clock hands don't overlap. This also makes 411 * it more difficult to determine the values for fastscan, slowscan 412 * and handspreadpages. 413 * 414 * The swapper is currently used to free up memory when pageout 415 * is unable to meet memory demands by swapping out processes. 416 * In addition to freeing up memory, swapping also reduces the 417 * demand for memory by preventing user processes from running 418 * and thereby consuming memory. 
419 */ 420 if (init_mfscan == 0) { 421 if (pageout_new_spread != 0) 422 maxfastscan = pageout_new_spread; 423 else 424 maxfastscan = MAXHANDSPREADPAGES; 425 } else { 426 maxfastscan = init_mfscan; 427 } 428 if (init_fscan == 0) 429 fastscan = MIN(looppages / loopfraction, maxfastscan); 430 else 431 fastscan = init_fscan; 432 if (fastscan > looppages / loopfraction) 433 fastscan = looppages / loopfraction; 434 435 /* 436 * Set slow scan time to 1/10 the fast scan time, but 437 * not to exceed maxslowscan. 438 */ 439 if (init_sscan == 0) 440 slowscan = MIN(fastscan / 10, maxslowscan); 441 else 442 slowscan = init_sscan; 443 if (slowscan > fastscan / 2) 444 slowscan = fastscan / 2; 445 446 /* 447 * Handspreadpages is distance (in pages) between front and back 448 * pageout daemon hands. The amount of time to reclaim a page 449 * once pageout examines it increases with this distance and 450 * decreases as the scan rate rises. It must be < the amount 451 * of pageable memory. 452 * 453 * Since pageout is limited to ~4% of the CPU, setting handspreadpages 454 * to be "fastscan" results in the front hand being a few secs 455 * (varies based on the processor speed) ahead of the back hand 456 * at fastscan rates. This distance can be further reduced, if 457 * necessary, by increasing the processor time used by pageout 458 * to be more than ~4% and preferrably not more than ~10%. 459 * 460 * As a result, user processes have a much better chance of 461 * referencing their pages before the back hand examines them. 462 * This also significantly lowers the number of reclaims from 463 * the freelist since pageout does not end up freeing pages which 464 * may be referenced a sec later. 465 */ 466 if (init_hspages == 0) 467 handspreadpages = fastscan; 468 else 469 handspreadpages = init_hspages; 470 471 /* 472 * Make sure that back hand follows front hand by at least 473 * 1/RATETOSCHEDPAGING seconds. 
Without this test, it is possible 474 * for the back hand to look at a page during the same wakeup of 475 * the pageout daemon in which the front hand cleared its ref bit. 476 */ 477 if (handspreadpages >= looppages) 478 handspreadpages = looppages - 1; 479 480 /* 481 * If we have been called to recalculate the parameters, 482 * set a flag to re-evaluate the clock hand pointers. 483 */ 484 if (recalc) 485 reset_hands = 1; 486 } 487 488 /* 489 * Pageout scheduling. 490 * 491 * Schedpaging controls the rate at which the page out daemon runs by 492 * setting the global variables nscan and desscan RATETOSCHEDPAGING 493 * times a second. Nscan records the number of pages pageout has examined 494 * in its current pass; schedpaging resets this value to zero each time 495 * it runs. Desscan records the number of pages pageout should examine 496 * in its next pass; schedpaging sets this value based on the amount of 497 * currently available memory. 498 */ 499 500 #define RATETOSCHEDPAGING 4 /* hz that is */ 501 502 static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */ 503 504 /* 505 * Pool of available async pageout putpage requests. 506 */ 507 static struct async_reqs *push_req; 508 static struct async_reqs *req_freelist; /* available req structs */ 509 static struct async_reqs *push_list; /* pending reqs */ 510 static kmutex_t push_lock; /* protects req pool */ 511 static kcondvar_t push_cv; 512 513 /* 514 * If pageout() is stuck on a single push for this many seconds, 515 * pageout_deadman() will assume the system has hit a memory deadlock. If set 516 * to 0, the deadman will have no effect. 517 * 518 * Note that we are only looking for stalls in the calls that pageout() makes 519 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging 520 * I/O, which should not take long unless the underlying strategy call blocks 521 * indefinitely for memory. The actual I/O request happens (or fails) later. 
522 */ 523 uint_t pageout_deadman_seconds = 90; 524 525 static uint_t pageout_stucktime = 0; 526 static bool pageout_pushing = false; 527 static uint64_t pageout_pushcount = 0; 528 static uint64_t pageout_pushcount_seen = 0; 529 530 static int async_list_size = 256; /* number of async request structs */ 531 532 static void pageout_scanner(void); 533 534 /* 535 * If a page is being shared more than "po_share" times 536 * then leave it alone- don't page it out. 537 */ 538 #define MIN_PO_SHARE (8) 539 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24) 540 ulong_t po_share = MIN_PO_SHARE; 541 542 /* 543 * Schedule rate for paging. 544 * Rate is linear interpolation between 545 * slowscan with lotsfree and fastscan when out of memory. 546 */ 547 static void 548 schedpaging(void *arg) 549 { 550 spgcnt_t vavail; 551 552 if (freemem < lotsfree + needfree + kmem_reapahead) 553 kmem_reap(); 554 555 if (freemem < lotsfree + needfree) 556 seg_preap(); 557 558 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree)) 559 kcage_cageout_wakeup(); 560 561 if (mutex_tryenter(&pageout_mutex)) { 562 /* pageout() not running */ 563 nscan = 0; 564 vavail = freemem - deficit; 565 if (pageout_new_spread != 0) 566 vavail -= needfree; 567 if (vavail < 0) 568 vavail = 0; 569 if (vavail > lotsfree) 570 vavail = lotsfree; 571 572 /* 573 * Fix for 1161438 (CRS SPR# 73922). All variables 574 * in the original calculation for desscan were 32 bit signed 575 * ints. As freemem approaches 0x0 on a system with 1 Gig or 576 * more of memory, the calculation can overflow. When this 577 * happens, desscan becomes negative and pageout_scanner() 578 * stops paging out. 579 */ 580 if ((needfree) && (pageout_new_spread == 0)) { 581 /* 582 * If we've not yet collected enough samples to 583 * calculate a spread, use the old logic of kicking 584 * into high gear anytime needfree is non-zero. 
585 */ 586 desscan = fastscan / RATETOSCHEDPAGING; 587 } else { 588 /* 589 * Once we've calculated a spread based on system 590 * memory and usage, just treat needfree as another 591 * form of deficit. 592 */ 593 spgcnt_t faststmp, slowstmp, result; 594 595 slowstmp = slowscan * vavail; 596 faststmp = fastscan * (lotsfree - vavail); 597 result = (slowstmp + faststmp) / 598 nz(lotsfree) / RATETOSCHEDPAGING; 599 desscan = (pgcnt_t)result; 600 } 601 602 pageout_ticks = min_pageout_ticks + (lotsfree - vavail) * 603 (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree); 604 605 if (freemem < lotsfree + needfree || 606 pageout_sample_cnt < pageout_sample_lim) { 607 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 608 "pageout_cv_signal:freemem %ld", freemem); 609 cv_signal(&proc_pageout->p_cv); 610 } else { 611 /* 612 * There are enough free pages, no need to 613 * kick the scanner thread. And next time 614 * around, keep more of the `highly shared' 615 * pages. 616 */ 617 cv_signal_pageout(); 618 if (po_share > MIN_PO_SHARE) { 619 po_share >>= 1; 620 } 621 } 622 mutex_exit(&pageout_mutex); 623 } 624 625 /* 626 * Signal threads waiting for available memory. 627 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but 628 * in this case it is not needed - the waiters will be waken up during 629 * the next invocation of this function. 630 */ 631 if (kmem_avail() > 0) 632 cv_broadcast(&memavail_cv); 633 634 (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING); 635 } 636 637 pgcnt_t pushes; 638 ulong_t push_list_size; /* # of requests on pageout queue */ 639 640 #define FRONT 1 641 #define BACK 2 642 643 int dopageout = 1; /* must be non-zero to turn page stealing on */ 644 645 /* 646 * The page out daemon, which runs as process 2. 647 * 648 * As long as there are at least lotsfree pages, 649 * this process is not run. 
When the number of free 650 * pages stays in the range desfree to lotsfree, 651 * this daemon runs through the pages in the loop 652 * at a rate determined in schedpaging(). Pageout manages 653 * two hands on the clock. The front hand moves through 654 * memory, clearing the reference bit, 655 * and stealing pages from procs that are over maxrss. 656 * The back hand travels a distance behind the front hand, 657 * freeing the pages that have not been referenced in the time 658 * since the front hand passed. If modified, they are pushed to 659 * swap before being freed. 660 * 661 * There are 2 threads that act on behalf of the pageout process. 662 * One thread scans pages (pageout_scanner) and frees them up if 663 * they don't require any VOP_PUTPAGE operation. If a page must be 664 * written back to its backing store, the request is put on a list 665 * and the other (pageout) thread is signaled. The pageout thread 666 * grabs VOP_PUTPAGE requests from the list, and processes them. 667 * Some filesystems may require resources for the VOP_PUTPAGE 668 * operations (like memory) and hence can block the pageout 669 * thread, but the scanner thread can still operate. There is still 670 * no guarantee that memory deadlocks cannot occur. 671 * 672 * For now, this thing is in very rough form. 673 */ 674 void 675 pageout() 676 { 677 struct async_reqs *arg; 678 pri_t pageout_pri; 679 int i; 680 pgcnt_t max_pushes; 681 callb_cpr_t cprinfo; 682 683 proc_pageout = ttoproc(curthread); 684 proc_pageout->p_cstime = 0; 685 proc_pageout->p_stime = 0; 686 proc_pageout->p_cutime = 0; 687 proc_pageout->p_utime = 0; 688 bcopy("pageout", PTOU(curproc)->u_psargs, 8); 689 bcopy("pageout", PTOU(curproc)->u_comm, 7); 690 691 /* 692 * Create pageout scanner thread 693 */ 694 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL); 695 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL); 696 697 /* 698 * Allocate and initialize the async request structures 699 * for pageout. 
700 */ 701 push_req = (struct async_reqs *) 702 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP); 703 704 req_freelist = push_req; 705 for (i = 0; i < async_list_size - 1; i++) 706 push_req[i].a_next = &push_req[i + 1]; 707 708 pageout_pri = curthread->t_pri; 709 710 /* Create the pageout scanner thread. */ 711 (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, 712 pageout_pri - 1); 713 714 /* 715 * kick off pageout scheduler. 716 */ 717 schedpaging(NULL); 718 719 /* 720 * Create kernel cage thread. 721 * The kernel cage thread is started under the pageout process 722 * to take advantage of the less restricted page allocation 723 * in page_create_throttle(). 724 */ 725 kcage_cageout_init(); 726 727 /* 728 * Limit pushes to avoid saturating pageout devices. 729 */ 730 max_pushes = maxpgio / RATETOSCHEDPAGING; 731 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout"); 732 733 for (;;) { 734 mutex_enter(&push_lock); 735 736 while ((arg = push_list) == NULL || pushes > max_pushes) { 737 CALLB_CPR_SAFE_BEGIN(&cprinfo); 738 cv_wait(&push_cv, &push_lock); 739 pushes = 0; 740 CALLB_CPR_SAFE_END(&cprinfo, &push_lock); 741 } 742 push_list = arg->a_next; 743 arg->a_next = NULL; 744 pageout_pushing = true; 745 mutex_exit(&push_lock); 746 747 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, 748 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { 749 pushes++; 750 } 751 752 /* vp held by checkpage() */ 753 VN_RELE(arg->a_vp); 754 755 mutex_enter(&push_lock); 756 pageout_pushing = false; 757 pageout_pushcount++; 758 arg->a_next = req_freelist; /* back on freelist */ 759 req_freelist = arg; 760 push_list_size--; 761 mutex_exit(&push_lock); 762 } 763 } 764 765 /* 766 * Kernel thread that scans pages looking for ones to free 767 */ 768 static void 769 pageout_scanner(void) 770 { 771 struct page *fronthand, *backhand; 772 uint_t count; 773 callb_cpr_t cprinfo; 774 pgcnt_t nscan_limit; 775 pgcnt_t pcount; 776 777 
CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); 778 mutex_enter(&pageout_mutex); 779 780 /* 781 * The restart case does not attempt to point the hands at roughly 782 * the right point on the assumption that after one circuit things 783 * will have settled down - and restarts shouldn't be that often. 784 */ 785 786 /* 787 * Set the two clock hands to be separated by a reasonable amount, 788 * but no more than 360 degrees apart. 789 */ 790 backhand = page_first(); 791 if (handspreadpages >= total_pages) 792 fronthand = page_nextn(backhand, total_pages - 1); 793 else 794 fronthand = page_nextn(backhand, handspreadpages); 795 796 min_pageout_ticks = MAX(1, 797 ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); 798 max_pageout_ticks = MAX(min_pageout_ticks, 799 ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING); 800 801 loop: 802 cv_signal_pageout(); 803 804 CALLB_CPR_SAFE_BEGIN(&cprinfo); 805 cv_wait(&proc_pageout->p_cv, &pageout_mutex); 806 CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); 807 808 if (!dopageout) 809 goto loop; 810 811 if (reset_hands) { 812 reset_hands = 0; 813 814 backhand = page_first(); 815 if (handspreadpages >= total_pages) 816 fronthand = page_nextn(backhand, total_pages - 1); 817 else 818 fronthand = page_nextn(backhand, handspreadpages); 819 } 820 821 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); 822 count = 0; 823 824 TRACE_4(TR_FAC_VM, TR_PAGEOUT_START, 825 "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld", 826 freemem, lotsfree, nscan, desscan); 827 828 /* Kernel probe */ 829 TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */, 830 tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree); 831 832 pcount = 0; 833 if (pageout_sample_cnt < pageout_sample_lim) { 834 nscan_limit = total_pages; 835 } else { 836 nscan_limit = desscan; 837 } 838 pageout_lbolt = ddi_get_lbolt(); 839 sample_start = gethrtime(); 840 841 /* 842 * Scan the appropriate number of pages for a single duty cycle. 
843 * However, stop scanning as soon as there is enough free memory. 844 * For a short while, we will be sampling the performance of the 845 * scanner and need to keep running just to get sample data, in 846 * which case we keep going and don't pay attention to whether 847 * or not there is enough free memory. 848 */ 849 850 while (nscan < nscan_limit && (freemem < lotsfree + needfree || 851 pageout_sample_cnt < pageout_sample_lim)) { 852 int rvfront, rvback; 853 854 /* 855 * Check to see if we have exceeded our %CPU budget 856 * for this wakeup, but not on every single page visited, 857 * just every once in a while. 858 */ 859 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { 860 pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt; 861 if (pageout_cycle_ticks >= pageout_ticks) { 862 ++pageout_timeouts; 863 break; 864 } 865 } 866 867 /* 868 * If checkpage manages to add a page to the free list, 869 * we give ourselves another couple of trips around the loop. 870 */ 871 if ((rvfront = checkpage(fronthand, FRONT)) == 1) 872 count = 0; 873 if ((rvback = checkpage(backhand, BACK)) == 1) 874 count = 0; 875 876 ++pcount; 877 878 /* 879 * protected by pageout_mutex instead of cpu_stat_lock 880 */ 881 CPU_STATS_ADDQ(CPU, vm, scan, 1); 882 883 /* 884 * Don't include ineligible pages in the number scanned. 885 */ 886 if (rvfront != -1 || rvback != -1) 887 nscan++; 888 889 backhand = page_next(backhand); 890 891 /* 892 * backhand update and wraparound check are done separately 893 * because lint barks when it finds an empty "if" body 894 */ 895 896 if ((fronthand = page_next(fronthand)) == page_first()) { 897 TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP, 898 "pageout_hand_wrap:freemem %ld whichhand %d", 899 freemem, FRONT); 900 901 /* 902 * protected by pageout_mutex instead of cpu_stat_lock 903 */ 904 CPU_STATS_ADDQ(CPU, vm, rev, 1); 905 if (++count > 1) { 906 /* 907 * Extremely unlikely, but it happens. 
908 * We went around the loop at least once 909 * and didn't get far enough. 910 * If we are still skipping `highly shared' 911 * pages, skip fewer of them. Otherwise, 912 * give up till the next clock tick. 913 */ 914 if (po_share < MAX_PO_SHARE) { 915 po_share <<= 1; 916 } else { 917 /* 918 * Really a "goto loop", but 919 * if someone is TRACing or 920 * TNF_PROBE_ing, at least 921 * make records to show 922 * where we are. 923 */ 924 break; 925 } 926 } 927 } 928 } 929 930 sample_end = gethrtime(); 931 932 TRACE_5(TR_FAC_VM, TR_PAGEOUT_END, 933 "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u", 934 freemem, lotsfree, nscan, desscan, count); 935 936 /* Kernel probe */ 937 TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, 938 tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem); 939 940 if (pageout_sample_cnt < pageout_sample_lim) { 941 pageout_sample_pages += pcount; 942 pageout_sample_etime += sample_end - sample_start; 943 ++pageout_sample_cnt; 944 } 945 if (pageout_sample_cnt >= pageout_sample_lim && 946 pageout_new_spread == 0) { 947 pageout_rate = (hrrate_t)pageout_sample_pages * 948 (hrrate_t)(NANOSEC) / pageout_sample_etime; 949 pageout_new_spread = pageout_rate / 10; 950 setupclock(1); 951 } 952 953 goto loop; 954 } 955 956 /* 957 * The pageout deadman is run once per second by clock(). 958 */ 959 void 960 pageout_deadman(void) 961 { 962 if (panicstr != NULL) { 963 /* 964 * There is no pageout after panic. 965 */ 966 return; 967 } 968 969 if (pageout_deadman_seconds == 0) { 970 /* 971 * The deadman is not enabled. 972 */ 973 return; 974 } 975 976 if (!pageout_pushing) { 977 goto reset; 978 } 979 980 /* 981 * We are pushing a page. Check to see if it is the same call we saw 982 * last time we looked: 983 */ 984 if (pageout_pushcount != pageout_pushcount_seen) { 985 /* 986 * It is a different call from the last check, so we are not 987 * stuck. 
988 */ 989 goto reset; 990 } 991 992 if (++pageout_stucktime >= pageout_deadman_seconds) { 993 panic("pageout_deadman: stuck pushing the same page for %d " 994 "seconds (freemem is %lu)", pageout_deadman_seconds, 995 freemem); 996 } 997 998 return; 999 1000 reset: 1001 /* 1002 * Reset our tracking state to reflect that we are not stuck: 1003 */ 1004 pageout_stucktime = 0; 1005 pageout_pushcount_seen = pageout_pushcount; 1006 } 1007 1008 /* 1009 * Look at the page at hand. If it is locked (e.g., for physical i/o), 1010 * system (u., page table) or free, then leave it alone. Otherwise, 1011 * if we are running the front hand, turn off the page's reference bit. 1012 * If the proc is over maxrss, we take it. If running the back hand, 1013 * check whether the page has been reclaimed. If not, free the page, 1014 * pushing it to disk first if necessary. 1015 * 1016 * Return values: 1017 * -1 if the page is not a candidate at all, 1018 * 0 if not freed, or 1019 * 1 if we freed it. 1020 */ 1021 static int 1022 checkpage(struct page *pp, int whichhand) 1023 { 1024 int ppattr; 1025 int isfs = 0; 1026 int isexec = 0; 1027 int pagesync_flag; 1028 1029 /* 1030 * Skip pages: 1031 * - associated with the kernel vnode since 1032 * they are always "exclusively" locked. 1033 * - that are free 1034 * - that are shared more than po_share'd times 1035 * - its already locked 1036 * 1037 * NOTE: These optimizations assume that reads are atomic. 1038 */ 1039 1040 if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) || 1041 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 1042 hat_page_checkshare(pp, po_share)) { 1043 return (-1); 1044 } 1045 1046 if (!page_trylock(pp, SE_EXCL)) { 1047 /* 1048 * Skip the page if we can't acquire the "exclusive" lock. 1049 */ 1050 return (-1); 1051 } else if (PP_ISFREE(pp)) { 1052 /* 1053 * It became free between the above check and our actually 1054 * locking the page. Oh, well there will be other pages. 
1055 */ 1056 page_unlock(pp); 1057 return (-1); 1058 } 1059 1060 /* 1061 * Reject pages that cannot be freed. The page_struct_lock 1062 * need not be acquired to examine these 1063 * fields since the page has an "exclusive" lock. 1064 */ 1065 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1066 page_unlock(pp); 1067 return (-1); 1068 } 1069 1070 /* 1071 * Maintain statistics for what we are freeing 1072 */ 1073 1074 if (pp->p_vnode != NULL) { 1075 if (pp->p_vnode->v_flag & VVMEXEC) 1076 isexec = 1; 1077 1078 if (!IS_SWAPFSVP(pp->p_vnode)) 1079 isfs = 1; 1080 } 1081 1082 /* 1083 * Turn off REF and MOD bits with the front hand. 1084 * The back hand examines the REF bit and always considers 1085 * SHARED pages as referenced. 1086 */ 1087 if (whichhand == FRONT) 1088 pagesync_flag = HAT_SYNC_ZERORM; 1089 else 1090 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF | 1091 HAT_SYNC_STOPON_SHARED; 1092 1093 ppattr = hat_pagesync(pp, pagesync_flag); 1094 1095 recheck: 1096 /* 1097 * If page is referenced; make unreferenced but reclaimable. 1098 * If this page is not referenced, then it must be reclaimable 1099 * and we can add it to the free list. 1100 */ 1101 if (ppattr & P_REF) { 1102 TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF, 1103 "pageout_isref:pp %p whichhand %d", pp, whichhand); 1104 if (whichhand == FRONT) { 1105 /* 1106 * Checking of rss or madvise flags needed here... 1107 * 1108 * If not "well-behaved", fall through into the code 1109 * for not referenced. 1110 */ 1111 hat_clrref(pp); 1112 } 1113 /* 1114 * Somebody referenced the page since the front 1115 * hand went by, so it's not a candidate for 1116 * freeing up. 1117 */ 1118 page_unlock(pp); 1119 return (0); 1120 } 1121 1122 VM_STAT_ADD(pageoutvmstats.checkpage[0]); 1123 1124 /* 1125 * If large page, attempt to demote it. If successfully demoted, 1126 * retry the checkpage. 
1127 */ 1128 if (pp->p_szc != 0) { 1129 if (!page_try_demote_pages(pp)) { 1130 VM_STAT_ADD(pageoutvmstats.checkpage[1]); 1131 page_unlock(pp); 1132 return (-1); 1133 } 1134 ASSERT(pp->p_szc == 0); 1135 VM_STAT_ADD(pageoutvmstats.checkpage[2]); 1136 /* 1137 * since page_try_demote_pages() could have unloaded some 1138 * mappings it makes sense to reload ppattr. 1139 */ 1140 ppattr = hat_page_getattr(pp, P_MOD | P_REF); 1141 } 1142 1143 /* 1144 * If the page is currently dirty, we have to arrange 1145 * to have it cleaned before it can be freed. 1146 * 1147 * XXX - ASSERT(pp->p_vnode != NULL); 1148 */ 1149 if ((ppattr & P_MOD) && pp->p_vnode) { 1150 struct vnode *vp = pp->p_vnode; 1151 u_offset_t offset = pp->p_offset; 1152 1153 /* 1154 * XXX - Test for process being swapped out or about to exit? 1155 * [Can't get back to process(es) using the page.] 1156 */ 1157 1158 /* 1159 * Hold the vnode before releasing the page lock to 1160 * prevent it from being freed and re-used by some 1161 * other thread. 1162 */ 1163 VN_HOLD(vp); 1164 page_unlock(pp); 1165 1166 /* 1167 * Queue i/o request for the pageout thread. 1168 */ 1169 if (!queue_io_request(vp, offset)) { 1170 VN_RELE(vp); 1171 return (0); 1172 } 1173 return (1); 1174 } 1175 1176 /* 1177 * Now we unload all the translations, 1178 * and put the page back on to the free list. 1179 * If the page was used (referenced or modified) after 1180 * the pagesync but before it was unloaded we catch it 1181 * and handle the page properly. 
1182 */ 1183 TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE, 1184 "pageout_free:pp %p whichhand %d", pp, whichhand); 1185 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1186 ppattr = hat_page_getattr(pp, P_MOD | P_REF); 1187 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) 1188 goto recheck; 1189 1190 /*LINTED: constant in conditional context*/ 1191 VN_DISPOSE(pp, B_FREE, 0, kcred); 1192 1193 CPU_STATS_ADD_K(vm, dfree, 1); 1194 1195 if (isfs) { 1196 if (isexec) { 1197 CPU_STATS_ADD_K(vm, execfree, 1); 1198 } else { 1199 CPU_STATS_ADD_K(vm, fsfree, 1); 1200 } 1201 } else { 1202 CPU_STATS_ADD_K(vm, anonfree, 1); 1203 } 1204 1205 return (1); /* freed a page! */ 1206 } 1207 1208 /* 1209 * Queue async i/o request from pageout_scanner and segment swapout 1210 * routines on one common list. This ensures that pageout devices (swap) 1211 * are not saturated by pageout_scanner or swapout requests. 1212 * The pageout thread empties this list by initiating i/o operations. 1213 */ 1214 int 1215 queue_io_request(vnode_t *vp, u_offset_t off) 1216 { 1217 struct async_reqs *arg; 1218 1219 /* 1220 * If we cannot allocate an async request struct, 1221 * skip this page. 1222 */ 1223 mutex_enter(&push_lock); 1224 if ((arg = req_freelist) == NULL) { 1225 mutex_exit(&push_lock); 1226 return (0); 1227 } 1228 req_freelist = arg->a_next; /* adjust freelist */ 1229 push_list_size++; 1230 1231 arg->a_vp = vp; 1232 arg->a_off = off; 1233 arg->a_len = PAGESIZE; 1234 arg->a_flags = B_ASYNC | B_FREE; 1235 arg->a_cred = kcred; /* always held */ 1236 1237 /* 1238 * Add to list of pending write requests. 1239 */ 1240 arg->a_next = push_list; 1241 push_list = arg; 1242 1243 if (req_freelist == NULL) { 1244 /* 1245 * No free async requests left. The lock is held so we 1246 * might as well signal the pusher thread now. 1247 */ 1248 cv_signal(&push_cv); 1249 } 1250 mutex_exit(&push_lock); 1251 return (1); 1252 } 1253 1254 /* 1255 * Wakeup pageout to initiate i/o if push_list is not empty. 
1256 */ 1257 void 1258 cv_signal_pageout() 1259 { 1260 if (push_list != NULL) { 1261 mutex_enter(&push_lock); 1262 cv_signal(&push_cv); 1263 mutex_exit(&push_lock); 1264 } 1265 } 1266