/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

static int checkpage(page_t *, int);

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time
 * based on the size of the system.  If they are patched non-zero in
 * a loaded vmunix they are left alone and may thus be changed per system
 * using adb on the loaded system.
 */
pgcnt_t	slowscan = 0;
pgcnt_t	fastscan = 0;

static pgcnt_t	handspreadpages = 0;
static int	loopfraction = 2;
static pgcnt_t	looppages;
static int	min_percent_cpu = 4;
static int	max_percent_cpu = 80;
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;

pgcnt_t	maxpgio = 0;
pgcnt_t	minfree = 0;
pgcnt_t	desfree = 0;
pgcnt_t	lotsfree = 0;
pgcnt_t	needfree = 0;
pgcnt_t	throttlefree = 0;
pgcnt_t	pageout_reserve = 0;

pgcnt_t	deficit;
pgcnt_t	nscan;
pgcnt_t	desscan;
/*
 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 * are the number of ticks in each wakeup cycle that gives the
 * equivalent of some underlying %CPU duty cycle.
 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 * So, for example, 4% == 1 tick and 80% == 20 ticks.
 *
 * min_pageout_ticks:
 *	ticks/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_ticks:
 *	ticks/wakeup equivalent of max_percent_cpu.
 *
 * pageout_ticks:
 *	Number of clock ticks budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_ticks .. max_pageout_ticks,
 *	depending on memory pressure.
 *
 * pageout_lbolt:
 *	Timestamp of the last time pageout_scanner woke up and started
 *	(or resumed) scanning for not recently referenced pages.
 */

static clock_t	min_pageout_ticks;
static clock_t	max_pageout_ticks;
static clock_t	pageout_ticks;
static clock_t	pageout_lbolt;
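/*
 * Illustrative arithmetic (a worked example, not compiled): with the
 * hz = 100 and RATETOSCHEDPAGING = 4 values discussed above, each
 * wakeup cycle is hz / RATETOSCHEDPAGING = 25 ticks long, so the
 * %CPU-to-ticks conversion done in pageout_scanner() works out to:
 *
 *	min_pageout_ticks = ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING
 *	                  = ((100 * 4) / 100) / 4   =  1 tick  (~4% CPU)
 *	max_pageout_ticks = ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING
 *	                  = ((100 * 80) / 100) / 4  = 20 ticks (~80% CPU)
 *
 * These numbers assume the default min_percent_cpu/max_percent_cpu
 * settings above; a different hz simply rescales both budgets.
 */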
static uint_t	reset_hands;

#define	PAGES_POLL_MASK	1023

/*
 * pageout_sample_lim:
 *	The limit on the number of samples needed to establish a value
 *	for new pageout parameters, fastscan, slowscan, and handspreadpages.
 *
 * pageout_sample_cnt:
 *	Current sample number.  Once the sample gets large enough,
 *	set new values for handspreadpages, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *	The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *	The accumulated scan time, in nanoseconds, for the sample.
 *
 * pageout_rate:
 *	Rate in pages/second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *	The new value to use for fastscan and handspreadpages.
 *	Calculated after enough samples have been taken.
 */

typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

static clock_t	pageout_cycle_ticks;
static hrtime_t	sample_start, sample_end;
static hrtime_t	pageout_sample_etime = 0;

/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif	/* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t memavail_lock;
kcondvar_t memavail_cv;

/*
 * The size of the clock loop.
 */
#define	LOOPPAGES	total_pages

/*
 * Set up the paging constants for the clock algorithm.
 * Called after the system is initialized and the amount of memory
 * and number of paging devices is known.
 *
 * lotsfree is 1/64 of memory, but at least 512K.
 * desfree is 1/2 of lotsfree.
 * minfree is 1/2 of desfree.
 *
 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
 *
 *	lotsfree = btop(512K)
 *	desfree = btop(200K)
 *	minfree = btop(100K)
 *	throttlefree = INT_MIN
 *	max_percent_cpu = 4
 */
void
setupclock(int recalc)
{

	static spgcnt_t init_lfree, init_dfree, init_mfree;
	static spgcnt_t init_tfree, init_preserve, init_mpgio;
	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;

	looppages = LOOPPAGES;

	/*
	 * setupclock can now be called to recalculate the paging
	 * parameters in the case of dynamic addition of memory.
	 * So to make sure we make the proper calculations, if such a
	 * situation should arise, we save away the initial values
	 * of each parameter so we can recall them when needed.  This
	 * way we don't lose the settings an admin might have made
	 * through the /etc/system file.
	 */

	if (!recalc) {
		init_lfree = lotsfree;
		init_dfree = desfree;
		init_mfree = minfree;
		init_tfree = throttlefree;
		init_preserve = pageout_reserve;
		init_mpgio = maxpgio;
		init_mfscan = maxfastscan;
		init_fscan = fastscan;
		init_sscan = slowscan;
		init_hspages = handspreadpages;
	}

	/*
	 * Set up thresholds for paging:
	 */

	/*
	 * Lotsfree is the threshold where the paging daemon turns on.
	 */
	if (init_lfree == 0 || init_lfree >= looppages)
		lotsfree = MAX(looppages / 64, btop(512 * 1024));
	else
		lotsfree = init_lfree;

	/*
	 * Desfree is the amount of memory desired free.
	 * If less than this for an extended period, start swapping.
	 */
	if (init_dfree == 0 || init_dfree >= lotsfree)
		desfree = lotsfree / 2;
	else
		desfree = init_dfree;

	/*
	 * Minfree is the minimal amount of free memory which is tolerable.
	 */
	if (init_mfree == 0 || init_mfree >= desfree)
		minfree = desfree / 2;
	else
		minfree = init_mfree;

	/*
	 * Throttlefree is the point at which we start throttling
	 * PG_WAIT requests until enough memory becomes available.
	 */
	if (init_tfree == 0 || init_tfree >= desfree)
		throttlefree = minfree;
	else
		throttlefree = init_tfree;

	/*
	 * Pageout_reserve is the number of pages that we keep in
	 * stock for pageout's own use.  Having a few such pages
	 * provides insurance against system deadlock due to
	 * pageout needing pages.  When freemem < pageout_reserve,
	 * non-blocking allocations are denied to any threads
	 * other than pageout and sched.  (At some point we might
	 * want to consider a per-thread flag like T_PUSHING_PAGES
	 * to indicate that a thread is part of the page-pushing
	 * dance (e.g. an interrupt thread) and thus is entitled
	 * to the same special dispensation we accord pageout.)
	 */
	if (init_preserve == 0 || init_preserve >= throttlefree)
		pageout_reserve = throttlefree / 2;
	else
		pageout_reserve = init_preserve;

	/*
	 * Maxpgio bounds how much paging i/o is acceptable.
	 * This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (init_mpgio == 0)
		maxpgio = (DISKRPM * 2) / 3;
	else
		maxpgio = init_mpgio;
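	/*
	 * Worked example (illustrative only, not compiled): on a
	 * hypothetical machine with 1 GB of 4K pages (looppages = 262144)
	 * and none of the tunables set in /etc/system, the cascade above
	 * yields:
	 *
	 *	lotsfree        = MAX(262144 / 64, btop(512K)) = 4096 pages
	 *	desfree         = lotsfree / 2                 = 2048 pages
	 *	minfree         = desfree / 2                  = 1024 pages
	 *	throttlefree    = minfree                      = 1024 pages
	 *	pageout_reserve = throttlefree / 2             =  512 pages
	 *
	 * If DISKRPM is 60 (one operation per revolution of a 3600 rpm
	 * disk; the actual value is platform-dependent), maxpgio defaults
	 * to (60 * 2) / 3 = 40 page i/o operations per second.
	 */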
	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  Fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, MHz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and to avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 MHz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 MHz SC2000:		68 Meg
	 *
	 *	40 MHz 486:		26 Meg
	 *	66 MHz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 MHz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPUs can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (init_mfscan == 0) {
		if (pageout_new_spread != 0)
			maxfastscan = pageout_new_spread;
		else
			maxfastscan = MAXHANDSPREADPAGES;
	} else {
		maxfastscan = init_mfscan;
	}
	if (init_fscan == 0)
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	else
		fastscan = init_fscan;
	if (fastscan > looppages / loopfraction)
		fastscan = looppages / loopfraction;

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (init_sscan == 0)
		slowscan = MIN(fastscan / 10, maxslowscan);
	else
		slowscan = init_sscan;
	if (slowscan > fastscan / 2)
		slowscan = fastscan / 2;

	/*
	 * Handspreadpages is the distance (in pages) between the front
	 * and back pageout daemon hands.  The amount of time to reclaim
	 * a page once pageout examines it increases with this distance
	 * and decreases as the scan rate rises.  It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting
	 * handspreadpages to be "fastscan" results in the front hand
	 * being a few secs (varies based on the processor speed) ahead
	 * of the back hand at fastscan rates.  This distance can be
	 * further reduced, if necessary, by increasing the processor
	 * time used by pageout to be more than ~4% and preferably not
	 * more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (init_hspages == 0)
		handspreadpages = fastscan;
	else
		handspreadpages = init_hspages;

	/*
	 * Make sure that the back hand follows the front hand by at least
	 * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
	 * for the back hand to look at a page during the same wakeup of
	 * the pageout daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages)
		handspreadpages = looppages - 1;

	/*
	 * If we have been called to recalculate the parameters,
	 * set a flag to re-evaluate the clock hand pointers.
	 */
	if (recalc)
		reset_hands = 1;
}
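/*
 * Continuing the hypothetical 1 GB / 4K-page example from above
 * (illustrative only, before any sampling has replaced the static
 * guess): MAXHANDSPREADPAGES is worth 64 Meg, i.e. 16384 4K pages,
 * so setupclock() computes:
 *
 *	maxfastscan     = MAXHANDSPREADPAGES          = 16384 pages
 *	fastscan        = MIN(262144 / 2, 16384)      = 16384 pages/sec
 *	slowscan        = MIN(16384 / 10, 100)        =   100 pages/sec
 *	handspreadpages = fastscan                    = 16384 pages
 *
 * At full speed the back hand therefore reaches a page about one
 * second after the front hand clears its ref bit (a 16384-page spread
 * at 16384 pages/sec), matching the "~1 sec" reclaim window described
 * in the comment above.
 */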
/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan RATETOSCHEDPAGING
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging sets this value based on the amount of
 * currently available memory.
 */

#define	RATETOSCHEDPAGING	4		/* hz that is */

static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

static int async_list_size = 256;	/* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;

/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree + seg_preapahead)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		/* pageout() not running */
		nscan = 0;
		vavail = freemem - deficit;
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		/*
		 * Fix for 1161438 (CRS SPR# 73922).  All variables
		 * in the original calculation for desscan were 32 bit signed
		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
		 * more of memory, the calculation can overflow.  When this
		 * happens, desscan becomes negative and pageout_scanner()
		 * stops paging out.
		 */
		if (needfree) {
			desscan = fastscan / RATETOSCHEDPAGING;
		} else {
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / RATETOSCHEDPAGING;
			desscan = (pgcnt_t)result;
		}

		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);

		if (freemem < lotsfree + needfree ||
		    pageout_sample_cnt < pageout_sample_lim) {
			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
			    "pageout_cv_signal:freemem %ld", freemem);
			cv_signal(&proc_pageout->p_cv);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner thread.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE) {
				po_share >>= 1;
			}
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast,
	 * but in this case it is not needed - the waiters will be woken up
	 * during the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
}
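/*
 * Worked example of the interpolation above (illustrative, using the
 * hypothetical 1 GB defaults computed earlier: lotsfree = 4096,
 * slowscan = 100, fastscan = 16384, min/max_pageout_ticks = 1/20,
 * and needfree == 0).  Suppose vavail = 2048, i.e. free memory sits
 * halfway between empty and lotsfree:
 *
 *	slowstmp = 100 * 2048            =   204800
 *	faststmp = 16384 * (4096 - 2048) = 33554432
 *	desscan  = (204800 + 33554432) / 4096 / 4
 *	         = 8242 / 4 = 2060 pages this wakeup
 *	           (~8240 pages/sec, about halfway between
 *	           slowscan and fastscan)
 *
 *	pageout_ticks = 1 + 2048 * (20 - 1) / 4096 = 10 ticks (~40% CPU)
 *
 * As vavail falls toward zero, the scan budget climbs linearly toward
 * fastscan / RATETOSCHEDPAGING and the tick budget toward
 * max_pageout_ticks.
 */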
pgcnt_t	pushes;
ulong_t	push_list_size;		/* # of requests on pageout queue */

#define	FRONT	1
#define	BACK	2

int dopageout = 1;	/* must be non-zero to turn page stealing on */

/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run.  When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in schedpaging().  Pageout manages
 * two hands on the clock.  The front hand moves through
 * memory, clearing the reference bit,
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed.  If modified, they are pushed to
 * swap before being freed.
 *
 * There are 2 threads that act on behalf of the pageout process.
 * One thread scans pages (pageout_scanner) and frees them up if
 * they don't require any VOP_PUTPAGE operation.  If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled.  The pageout thread
 * grabs VOP_PUTPAGE requests from the list, and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the scanner thread can still operate.  There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * For now, this thing is in very rough form.
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", u.u_psargs, 8);
	bcopy("pageout", u.u_comm, 7);

	/*
	 * Create pageout scanner thread
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++)
		push_req[i].a_next = &push_req[i + 1];

	pageout_pri = curthread->t_pri;
	pageout_init(pageout_scanner, proc_pageout, pageout_pri - 1);

	/*
	 * kick off pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / RATETOSCHEDPAGING;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		mutex_exit(&push_lock);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags,
		    arg->a_cred) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}
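/*
 * Illustrative note on the push throttle above (worked numbers, not
 * compiled): with the hypothetical maxpgio = 40 from the earlier
 * example, max_pushes is 40 / RATETOSCHEDPAGING = 10, so the loop
 * issues at most ten VOP_PUTPAGE operations, then parks in cv_wait()
 * until the next cv_signal on push_cv (from queue_io_request() or
 * cv_signal_pageout()), at which point it resets pushes to 0.  This
 * caps pageout i/o at roughly maxpgio operations per second no matter
 * how quickly the scanner queues requests.
 */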
/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void)
{
	struct page *fronthand, *backhand;
	uint_t count;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_limit;
	pgcnt_t	pcount;

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down - and restarts shouldn't be that often.
	 */

	/*
	 * Set the two clock hands to be separated by a reasonable amount,
	 * but no more than 360 degrees apart.
	 */
	backhand = page_first();
	if (handspreadpages >= total_pages)
		fronthand = page_nextn(backhand, total_pages - 1);
	else
		fronthand = page_nextn(backhand, handspreadpages);

	min_pageout_ticks = MAX(1,
	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
	max_pageout_ticks = MAX(min_pageout_ticks,
	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);

loop:
	cv_signal_pageout();

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

	if (!dopageout)
		goto loop;

	if (reset_hands) {
		reset_hands = 0;

		backhand = page_first();
		if (handspreadpages >= total_pages)
			fronthand = page_nextn(backhand, total_pages - 1);
		else
			fronthand = page_nextn(backhand, handspreadpages);
	}

	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
	count = 0;

	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
	    freemem, lotsfree, nscan, desscan);

	/* Kernel probe */
	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
	    tnf_ulong, pages_free, freemem,
	    tnf_ulong, pages_needed, needfree);

	pcount = 0;
	if (pageout_sample_cnt < pageout_sample_lim) {
		nscan_limit = total_pages;
	} else {
		nscan_limit = desscan;
	}
	pageout_lbolt = lbolt;
	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 * However, stop scanning as soon as there is enough free memory.
	 * For a short while, we will be sampling the performance of the
	 * scanner and need to keep running just to get sample data, in
	 * which case we keep going and don't pay attention to whether
	 * or not there is enough free memory.
	 */

	while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
	    pageout_sample_cnt < pageout_sample_lim)) {
		int rvfront, rvback;

		/*
		 * Check to see if we have exceeded our %CPU budget
		 * for this wakeup, but not on every single page visited,
		 * just every once in a while.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			pageout_cycle_ticks = lbolt - pageout_lbolt;
			if (pageout_cycle_ticks >= pageout_ticks) {
				++pageout_timeouts;
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fronthand, FRONT)) == 1)
			count = 0;
		if ((rvback = checkpage(backhand, BACK)) == 1)
			count = 0;

		++pcount;

		/*
		 * protected by pageout_mutex instead of cpu_stat_lock
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != -1 || rvback != -1)
			nscan++;

		backhand = page_next(backhand);

		/*
		 * backhand update and wraparound check are done separately
		 * because lint barks when it finds an empty "if" body
		 */

		if ((fronthand = page_next(fronthand)) == page_first()) {
			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
			    "pageout_hand_wrap:freemem %ld whichhand %d",
			    freemem, FRONT);

			/*
			 * protected by pageout_mutex instead of cpu_stat_lock
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);
			if (++count > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					/*
					 * Really a "goto loop", but
					 * if someone is TRACing or
					 * TNF_PROBE_ing, at least
					 * make records to show
					 * where we are.
					 */
					break;
				}
			}
		}
	}

	sample_end = gethrtime();

	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
	    freemem, lotsfree, nscan, desscan, count);

	/* Kernel probe */
	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
	    tnf_ulong, pages_scanned, nscan,
	    tnf_ulong, pages_free, freemem);

	if (pageout_sample_cnt < pageout_sample_lim) {
		pageout_sample_pages += pcount;
		pageout_sample_etime += sample_end - sample_start;
		++pageout_sample_cnt;
	}
	if (pageout_sample_cnt >= pageout_sample_lim &&
	    pageout_new_spread == 0) {
		pageout_rate = (hrrate_t)pageout_sample_pages *
		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
		pageout_new_spread = pageout_rate / 10;
		setupclock(1);
	}

	goto loop;
}
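/*
 * Illustrative example of the sampling math above (hypothetical
 * numbers, not measurements): suppose the first pageout_sample_lim = 4
 * scan cycles together visit pageout_sample_pages = 400000 pages in
 * pageout_sample_etime = 2 * 10^9 ns of scanning.  Then
 *
 *	pageout_rate       = 400000 * NANOSEC / (2 * 10^9)
 *	                   = 200000 pages/sec
 *	pageout_new_spread = 200000 / 10 = 20000 pages
 *
 * and setupclock(1) adopts 20000 pages as the new maxfastscan and,
 * via fastscan, as handspreadpages, replacing the static
 * MAXHANDSPREADPAGES guess with a measured scan rate.
 */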
/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * system (u., page table) or free, then leave it alone.  Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it.  If running the back hand,
 * check whether the page has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *	-1 if the page is not a candidate at all,
 *	 0 if not freed, or
 *	 1 if we freed it.
 */
static int
checkpage(struct page *pp, int whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode since
	 *	    they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share times
	 *	- that are already locked
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */
top:
	if ((pp->p_vnode == &kvp) ||
	    (PP_ISFREE(pp)) ||
	    (hat_page_getshare(pp) > po_share) || PAGE_LOCKED(pp)) {
		return (-1);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (-1);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (-1);
	}

	/*
	 * Reject pages that cannot be freed.  The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (-1);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */

	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == FRONT)
		pagesync_flag = HAT_SYNC_ZERORM;
	else
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If the page is referenced, make it unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
		    "pageout_isref:pp %p whichhand %d", pp, whichhand);
		if (whichhand == FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}
		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (0);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it.  If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (-1);
		}
		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}
	/*
	 * If the page is currently dirty, we have to arrange
	 * to have it cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue i/o request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (0);
		}
		return (1);
	}

	/*
	 * Now we unload all the translations,
	 * and put the page back on to the free list.
	 * If the page was used (referenced or modified) after
	 * the pagesync but before it was unloaded we catch it
	 * and handle the page properly.
	 */
	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
	    "pageout_free:pp %p whichhand %d", pp, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
		goto recheck;

	/*LINTED: constant in conditional context*/
	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (1);		/* freed a page! */
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;		/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left.  The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wake up pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}
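/*
 * Illustrative sketch (not part of this file's code): memavail_lock
 * and memavail_cv, declared above, follow the usual condition-variable
 * pattern.  A consumer that needs pages would block like this, and the
 * cv_broadcast() in schedpaging() releases it once kmem_avail() goes
 * positive:
 *
 *	mutex_enter(&memavail_lock);
 *	while (freemem < needed_pages)
 *		cv_wait(&memavail_cv, &memavail_lock);
 *	mutex_exit(&memavail_lock);
 *
 * "needed_pages" is a hypothetical local variable used only for this
 * example, not something defined in this file.
 */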