/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 #include <sys/types.h> 42 #include <sys/t_lock.h> 43 #include <sys/param.h> 44 #include <sys/buf.h> 45 #include <sys/uio.h> 46 #include <sys/proc.h> 47 #include <sys/systm.h> 48 #include <sys/mman.h> 49 #include <sys/cred.h> 50 #include <sys/vnode.h> 51 #include <sys/vm.h> 52 #include <sys/vmparam.h> 53 #include <sys/vtrace.h> 54 #include <sys/cmn_err.h> 55 #include <sys/cpuvar.h> 56 #include <sys/user.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/callb.h> 60 #include <sys/tnf_probe.h> 61 #include <sys/mem_cage.h> 62 #include <sys/time.h> 63 64 #include <vm/hat.h> 65 #include <vm/as.h> 66 #include <vm/seg.h> 67 #include <vm/page.h> 68 #include <vm/pvn.h> 69 #include <vm/seg_kmem.h> 70 71 static int checkpage(page_t *, int); 72 73 /* 74 * The following parameters control operation of the page replacement 75 * algorithm. They are initialized to 0, and then computed at boot time 76 * based on the size of the system. If they are patched non-zero in 77 * a loaded vmunix they are left alone and may thus be changed per system 78 * using adb on the loaded system. 79 */ 80 pgcnt_t slowscan = 0; 81 pgcnt_t fastscan = 0; 82 83 static pgcnt_t handspreadpages = 0; 84 static int loopfraction = 2; 85 static pgcnt_t looppages; 86 static int min_percent_cpu = 4; 87 static int max_percent_cpu = 80; 88 static pgcnt_t maxfastscan = 0; 89 static pgcnt_t maxslowscan = 100; 90 91 pgcnt_t maxpgio = 0; 92 pgcnt_t minfree = 0; 93 pgcnt_t desfree = 0; 94 pgcnt_t lotsfree = 0; 95 pgcnt_t needfree = 0; 96 pgcnt_t throttlefree = 0; 97 pgcnt_t pageout_reserve = 0; 98 99 pgcnt_t deficit; 100 pgcnt_t nscan; 101 pgcnt_t desscan; 102 103 /* 104 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks 105 * are the number of ticks in each wakeup cycle that gives the 106 * equivalent of some underlying %CPU duty cycle. 
107 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is 108 * awakened every 25 clock ticks. So, converting from %CPU to ticks 109 * per wakeup cycle would be x% of 25, that is (x * 100) / 25. 110 * So, for example, 4% == 1 tick and 80% == 20 ticks. 111 * 112 * min_pageout_ticks: 113 * ticks/wakeup equivalent of min_percent_cpu. 114 * 115 * max_pageout_ticks: 116 * ticks/wakeup equivalent of max_percent_cpu. 117 * 118 * pageout_ticks: 119 * Number of clock ticks budgeted for each wakeup cycle. 120 * Computed each time around by schedpaging(). 121 * Varies between min_pageout_ticks .. max_pageout_ticks, 122 * depending on memory pressure. 123 * 124 * pageout_lbolt: 125 * Timestamp of the last time pageout_scanner woke up and started 126 * (or resumed) scanning for not recently referenced pages. 127 */ 128 129 static clock_t min_pageout_ticks; 130 static clock_t max_pageout_ticks; 131 static clock_t pageout_ticks; 132 static clock_t pageout_lbolt; 133 134 static uint_t reset_hands; 135 136 #define PAGES_POLL_MASK 1023 137 138 /* 139 * pageout_sample_lim: 140 * The limit on the number of samples needed to establish a value 141 * for new pageout parameters, fastscan, slowscan, and handspreadpages. 142 * 143 * pageout_sample_cnt: 144 * Current sample number. Once the sample gets large enough, 145 * set new values for handspreadpages, fastscan and slowscan. 146 * 147 * pageout_sample_pages: 148 * The accumulated number of pages scanned during sampling. 149 * 150 * pageout_sample_ticks: 151 * The accumulated clock ticks for the sample. 152 * 153 * pageout_rate: 154 * Rate in pages/nanosecond, computed at the end of sampling. 155 * 156 * pageout_new_spread: 157 * The new value to use for fastscan and handspreadpages. 158 * Calculated after enough samples have been taken. 
159 */ 160 161 typedef hrtime_t hrrate_t; 162 163 static uint64_t pageout_sample_lim = 4; 164 static uint64_t pageout_sample_cnt = 0; 165 static pgcnt_t pageout_sample_pages = 0; 166 static hrrate_t pageout_rate = 0; 167 static pgcnt_t pageout_new_spread = 0; 168 169 static clock_t pageout_cycle_ticks; 170 static hrtime_t sample_start, sample_end; 171 static hrtime_t pageout_sample_etime = 0; 172 173 /* 174 * Record number of times a pageout_scanner wakeup cycle finished because it 175 * timed out (exceeded its CPU budget), rather than because it visited 176 * its budgeted number of pages. 177 */ 178 uint64_t pageout_timeouts = 0; 179 180 #ifdef VM_STATS 181 static struct pageoutvmstats_str { 182 ulong_t checkpage[3]; 183 } pageoutvmstats; 184 #endif /* VM_STATS */ 185 186 /* 187 * Threads waiting for free memory use this condition variable and lock until 188 * memory becomes available. 189 */ 190 kmutex_t memavail_lock; 191 kcondvar_t memavail_cv; 192 193 /* 194 * The size of the clock loop. 195 */ 196 #define LOOPPAGES total_pages 197 198 /* 199 * Set up the paging constants for the clock algorithm. 200 * Called after the system is initialized and the amount of memory 201 * and number of paging devices is known. 202 * 203 * lotsfree is 1/64 of memory, but at least 512K. 204 * desfree is 1/2 of lotsfree. 205 * minfree is 1/2 of desfree. 206 * 207 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set: 208 * 209 * lotsfree = btop(512K) 210 * desfree = btop(200K) 211 * minfree = btop(100K) 212 * throttlefree = INT_MIN 213 * max_percent_cpu = 4 214 */ 215 void 216 setupclock(int recalc) 217 { 218 219 static spgcnt_t init_lfree, init_dfree, init_mfree; 220 static spgcnt_t init_tfree, init_preserve, init_mpgio; 221 static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages; 222 223 looppages = LOOPPAGES; 224 225 /* 226 * setupclock can now be called to recalculate the paging 227 * parameters in the case of dynamic addition of memory. 
228 * So to make sure we make the proper calculations, if such a 229 * situation should arise, we save away the initial values 230 * of each parameter so we can recall them when needed. This 231 * way we don't lose the settings an admin might have made 232 * through the /etc/system file. 233 */ 234 235 if (!recalc) { 236 init_lfree = lotsfree; 237 init_dfree = desfree; 238 init_mfree = minfree; 239 init_tfree = throttlefree; 240 init_preserve = pageout_reserve; 241 init_mpgio = maxpgio; 242 init_mfscan = maxfastscan; 243 init_fscan = fastscan; 244 init_sscan = slowscan; 245 init_hspages = handspreadpages; 246 } 247 248 /* 249 * Set up thresholds for paging: 250 */ 251 252 /* 253 * Lotsfree is threshold where paging daemon turns on. 254 */ 255 if (init_lfree == 0 || init_lfree >= looppages) 256 lotsfree = MAX(looppages / 64, btop(512 * 1024)); 257 else 258 lotsfree = init_lfree; 259 260 /* 261 * Desfree is amount of memory desired free. 262 * If less than this for extended period, start swapping. 263 */ 264 if (init_dfree == 0 || init_dfree >= lotsfree) 265 desfree = lotsfree / 2; 266 else 267 desfree = init_dfree; 268 269 /* 270 * Minfree is minimal amount of free memory which is tolerable. 271 */ 272 if (init_mfree == 0 || init_mfree >= desfree) 273 minfree = desfree / 2; 274 else 275 minfree = init_mfree; 276 277 /* 278 * Throttlefree is the point at which we start throttling 279 * PG_WAIT requests until enough memory becomes available. 280 */ 281 if (init_tfree == 0 || init_tfree >= desfree) 282 throttlefree = minfree; 283 else 284 throttlefree = init_tfree; 285 286 /* 287 * Pageout_reserve is the number of pages that we keep in 288 * stock for pageout's own use. Having a few such pages 289 * provides insurance against system deadlock due to 290 * pageout needing pages. When freemem < pageout_reserve, 291 * non-blocking allocations are denied to any threads 292 * other than pageout and sched. 
(At some point we might 293 * want to consider a per-thread flag like T_PUSHING_PAGES 294 * to indicate that a thread is part of the page-pushing 295 * dance (e.g. an interrupt thread) and thus is entitled 296 * to the same special dispensation we accord pageout.) 297 */ 298 if (init_preserve == 0 || init_preserve >= throttlefree) 299 pageout_reserve = throttlefree / 2; 300 else 301 pageout_reserve = init_preserve; 302 303 /* 304 * Maxpgio thresholds how much paging is acceptable. 305 * This figures that 2/3 busy on an arm is all that is 306 * tolerable for paging. We assume one operation per disk rev. 307 * 308 * XXX - Does not account for multiple swap devices. 309 */ 310 if (init_mpgio == 0) 311 maxpgio = (DISKRPM * 2) / 3; 312 else 313 maxpgio = init_mpgio; 314 315 /* 316 * The clock scan rate varies between fastscan and slowscan 317 * based on the amount of free memory available. Fastscan 318 * rate should be set based on the number pages that can be 319 * scanned per sec using ~10% of processor time. Since this 320 * value depends on the processor, MMU, Mhz etc., it is 321 * difficult to determine it in a generic manner for all 322 * architectures. 323 * 324 * Instead of trying to determine the number of pages scanned 325 * per sec for every processor, fastscan is set to be the smaller 326 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling 327 * time is limited to ~4% of processor time. 328 * 329 * Setting fastscan to be 1/2 of memory allows pageout to scan 330 * all of memory in ~2 secs. This implies that user pages not 331 * accessed within 1 sec (assuming, handspreadpages == fastscan) 332 * can be reclaimed when free memory is very low. Stealing pages 333 * not accessed within 1 sec seems reasonable and ensures that 334 * active user processes don't thrash. 
335 * 336 * Smaller values of fastscan result in scanning fewer pages 337 * every second and consequently pageout may not be able to free 338 * sufficient memory to maintain the minimum threshold. Larger 339 * values of fastscan result in scanning a lot more pages which 340 * could lead to thrashing and higher CPU usage. 341 * 342 * Fastscan needs to be limited to a maximum value and should not 343 * scale with memory to prevent pageout from consuming too much 344 * time for scanning on slow CPU's and avoid thrashing, as a 345 * result of scanning too many pages, on faster CPU's. 346 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES 347 * (the upper bound for fastscan) based on the average number 348 * of pages that can potentially be scanned in ~1 sec (using ~4% 349 * of the CPU) on some of the following machines that currently 350 * run Solaris 2.x: 351 * 352 * average memory scanned in ~1 sec 353 * 354 * 25 Mhz SS1+: 23 Meg 355 * LX: 37 Meg 356 * 50 Mhz SC2000: 68 Meg 357 * 358 * 40 Mhz 486: 26 Meg 359 * 66 Mhz 486: 42 Meg 360 * 361 * When free memory falls just below lotsfree, the scan rate 362 * goes from 0 to slowscan (i.e., pageout starts running). This 363 * transition needs to be smooth and is achieved by ensuring that 364 * pageout scans a small number of pages to satisfy the transient 365 * memory demand. This is set to not exceed 100 pages/sec (25 per 366 * wakeup) since scanning that many pages has no noticible impact 367 * on system performance. 368 * 369 * In addition to setting fastscan and slowscan, pageout is 370 * limited to using ~4% of the CPU. This results in increasing 371 * the time taken to scan all of memory, which in turn means that 372 * user processes have a better opportunity of preventing their 373 * pages from being stolen. This has a positive effect on 374 * interactive and overall system performance when memory demand 375 * is high. 
376 * 377 * Thus, the rate at which pages are scanned for replacement will 378 * vary linearly between slowscan and the number of pages that 379 * can be scanned using ~4% of processor time instead of varying 380 * linearly between slowscan and fastscan. 381 * 382 * Also, the processor time used by pageout will vary from ~1% 383 * at slowscan to ~4% at fastscan instead of varying between 384 * ~1% at slowscan and ~10% at fastscan. 385 * 386 * The values chosen for the various VM parameters (fastscan, 387 * handspreadpages, etc) are not universally true for all machines, 388 * but appear to be a good rule of thumb for the machines we've 389 * tested. They have the following ranges: 390 * 391 * cpu speed: 20 to 70 Mhz 392 * page size: 4K to 8K 393 * memory size: 16M to 5G 394 * page scan rate: 4000 - 17400 4K pages per sec 395 * 396 * The values need to be re-examined for machines which don't 397 * fall into the various ranges (e.g., slower or faster CPUs, 398 * smaller or larger pagesizes etc) shown above. 399 * 400 * On an MP machine, pageout is often unable to maintain the 401 * minimum paging thresholds under heavy load. This is due to 402 * the fact that user processes running on other CPU's can be 403 * dirtying memory at a much faster pace than pageout can find 404 * pages to free. The memory demands could be met by enabling 405 * more than one CPU to run the clock algorithm in such a manner 406 * that the various clock hands don't overlap. This also makes 407 * it more difficult to determine the values for fastscan, slowscan 408 * and handspreadpages. 409 * 410 * The swapper is currently used to free up memory when pageout 411 * is unable to meet memory demands by swapping out processes. 412 * In addition to freeing up memory, swapping also reduces the 413 * demand for memory by preventing user processes from running 414 * and thereby consuming memory. 
415 */ 416 if (init_mfscan == 0) { 417 if (pageout_new_spread != 0) 418 maxfastscan = pageout_new_spread; 419 else 420 maxfastscan = MAXHANDSPREADPAGES; 421 } else { 422 maxfastscan = init_mfscan; 423 } 424 if (init_fscan == 0) 425 fastscan = MIN(looppages / loopfraction, maxfastscan); 426 else 427 fastscan = init_fscan; 428 if (fastscan > looppages / loopfraction) 429 fastscan = looppages / loopfraction; 430 431 /* 432 * Set slow scan time to 1/10 the fast scan time, but 433 * not to exceed maxslowscan. 434 */ 435 if (init_sscan == 0) 436 slowscan = MIN(fastscan / 10, maxslowscan); 437 else 438 slowscan = init_sscan; 439 if (slowscan > fastscan / 2) 440 slowscan = fastscan / 2; 441 442 /* 443 * Handspreadpages is distance (in pages) between front and back 444 * pageout daemon hands. The amount of time to reclaim a page 445 * once pageout examines it increases with this distance and 446 * decreases as the scan rate rises. It must be < the amount 447 * of pageable memory. 448 * 449 * Since pageout is limited to ~4% of the CPU, setting handspreadpages 450 * to be "fastscan" results in the front hand being a few secs 451 * (varies based on the processor speed) ahead of the back hand 452 * at fastscan rates. This distance can be further reduced, if 453 * necessary, by increasing the processor time used by pageout 454 * to be more than ~4% and preferrably not more than ~10%. 455 * 456 * As a result, user processes have a much better chance of 457 * referencing their pages before the back hand examines them. 458 * This also significantly lowers the number of reclaims from 459 * the freelist since pageout does not end up freeing pages which 460 * may be referenced a sec later. 461 */ 462 if (init_hspages == 0) 463 handspreadpages = fastscan; 464 else 465 handspreadpages = init_hspages; 466 467 /* 468 * Make sure that back hand follows front hand by at least 469 * 1/RATETOSCHEDPAGING seconds. 
Without this test, it is possible 470 * for the back hand to look at a page during the same wakeup of 471 * the pageout daemon in which the front hand cleared its ref bit. 472 */ 473 if (handspreadpages >= looppages) 474 handspreadpages = looppages - 1; 475 476 /* 477 * If we have been called to recalculate the parameters, 478 * set a flag to re-evaluate the clock hand pointers. 479 */ 480 if (recalc) 481 reset_hands = 1; 482 } 483 484 /* 485 * Pageout scheduling. 486 * 487 * Schedpaging controls the rate at which the page out daemon runs by 488 * setting the global variables nscan and desscan RATETOSCHEDPAGING 489 * times a second. Nscan records the number of pages pageout has examined 490 * in its current pass; schedpaging resets this value to zero each time 491 * it runs. Desscan records the number of pages pageout should examine 492 * in its next pass; schedpaging sets this value based on the amount of 493 * currently available memory. 494 */ 495 496 #define RATETOSCHEDPAGING 4 /* hz that is */ 497 498 static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */ 499 500 /* 501 * Pool of available async pageout putpage requests. 502 */ 503 static struct async_reqs *push_req; 504 static struct async_reqs *req_freelist; /* available req structs */ 505 static struct async_reqs *push_list; /* pending reqs */ 506 static kmutex_t push_lock; /* protects req pool */ 507 static kcondvar_t push_cv; 508 509 static int async_list_size = 256; /* number of async request structs */ 510 511 static void pageout_scanner(void); 512 513 /* 514 * If a page is being shared more than "po_share" times 515 * then leave it alone- don't page it out. 516 */ 517 #define MIN_PO_SHARE (8) 518 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24) 519 ulong_t po_share = MIN_PO_SHARE; 520 521 /* 522 * Schedule rate for paging. 523 * Rate is linear interpolation between 524 * slowscan with lotsfree and fastscan when out of memory. 
525 */ 526 static void 527 schedpaging(void *arg) 528 { 529 spgcnt_t vavail; 530 531 if (freemem < lotsfree + needfree + kmem_reapahead) 532 kmem_reap(); 533 534 if (freemem < lotsfree + needfree + seg_preapahead) 535 seg_preap(); 536 537 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree)) 538 kcage_cageout_wakeup(); 539 540 if (mutex_tryenter(&pageout_mutex)) { 541 /* pageout() not running */ 542 nscan = 0; 543 vavail = freemem - deficit; 544 if (vavail < 0) 545 vavail = 0; 546 if (vavail > lotsfree) 547 vavail = lotsfree; 548 549 /* 550 * Fix for 1161438 (CRS SPR# 73922). All variables 551 * in the original calculation for desscan were 32 bit signed 552 * ints. As freemem approaches 0x0 on a system with 1 Gig or 553 * more of memory, the calculation can overflow. When this 554 * happens, desscan becomes negative and pageout_scanner() 555 * stops paging out. 556 */ 557 if (needfree) { 558 desscan = fastscan / RATETOSCHEDPAGING; 559 } else { 560 spgcnt_t faststmp, slowstmp, result; 561 562 slowstmp = slowscan * vavail; 563 faststmp = fastscan * (lotsfree - vavail); 564 result = (slowstmp + faststmp) / 565 nz(lotsfree) / RATETOSCHEDPAGING; 566 desscan = (pgcnt_t)result; 567 } 568 569 pageout_ticks = min_pageout_ticks + (lotsfree - vavail) * 570 (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree); 571 572 if (freemem < lotsfree + needfree || 573 pageout_sample_cnt < pageout_sample_lim) { 574 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 575 "pageout_cv_signal:freemem %ld", freemem); 576 cv_signal(&proc_pageout->p_cv); 577 } else { 578 /* 579 * There are enough free pages, no need to 580 * kick the scanner thread. And next time 581 * around, keep more of the `highly shared' 582 * pages. 583 */ 584 cv_signal_pageout(); 585 if (po_share > MIN_PO_SHARE) { 586 po_share >>= 1; 587 } 588 } 589 mutex_exit(&pageout_mutex); 590 } 591 592 /* 593 * Signal threads waiting for available memory. 
594 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but 595 * in this case it is not needed - the waiters will be waken up during 596 * the next invocation of this function. 597 */ 598 if (kmem_avail() > 0) 599 cv_broadcast(&memavail_cv); 600 601 (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING); 602 } 603 604 pgcnt_t pushes; 605 ulong_t push_list_size; /* # of requests on pageout queue */ 606 607 #define FRONT 1 608 #define BACK 2 609 610 int dopageout = 1; /* must be non-zero to turn page stealing on */ 611 612 /* 613 * The page out daemon, which runs as process 2. 614 * 615 * As long as there are at least lotsfree pages, 616 * this process is not run. When the number of free 617 * pages stays in the range desfree to lotsfree, 618 * this daemon runs through the pages in the loop 619 * at a rate determined in schedpaging(). Pageout manages 620 * two hands on the clock. The front hand moves through 621 * memory, clearing the reference bit, 622 * and stealing pages from procs that are over maxrss. 623 * The back hand travels a distance behind the front hand, 624 * freeing the pages that have not been referenced in the time 625 * since the front hand passed. If modified, they are pushed to 626 * swap before being freed. 627 * 628 * There are 2 threads that act on behalf of the pageout process. 629 * One thread scans pages (pageout_scanner) and frees them up if 630 * they don't require any VOP_PUTPAGE operation. If a page must be 631 * written back to its backing store, the request is put on a list 632 * and the other (pageout) thread is signaled. The pageout thread 633 * grabs VOP_PUTPAGE requests from the list, and processes them. 634 * Some filesystems may require resources for the VOP_PUTPAGE 635 * operations (like memory) and hence can block the pageout 636 * thread, but the scanner thread can still operate. There is still 637 * no gaurentee that memory deadlocks cannot occur. 638 * 639 * For now, this thing is in very rough form. 
640 */ 641 void 642 pageout() 643 { 644 struct async_reqs *arg; 645 pri_t pageout_pri; 646 int i; 647 pgcnt_t max_pushes; 648 callb_cpr_t cprinfo; 649 650 proc_pageout = ttoproc(curthread); 651 proc_pageout->p_cstime = 0; 652 proc_pageout->p_stime = 0; 653 proc_pageout->p_cutime = 0; 654 proc_pageout->p_utime = 0; 655 bcopy("pageout", u.u_psargs, 8); 656 bcopy("pageout", u.u_comm, 7); 657 658 /* 659 * Create pageout scanner thread 660 */ 661 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL); 662 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL); 663 664 /* 665 * Allocate and initialize the async request structures 666 * for pageout. 667 */ 668 push_req = (struct async_reqs *) 669 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP); 670 671 req_freelist = push_req; 672 for (i = 0; i < async_list_size - 1; i++) 673 push_req[i].a_next = &push_req[i + 1]; 674 675 pageout_pri = curthread->t_pri; 676 pageout_init(pageout_scanner, proc_pageout, pageout_pri - 1); 677 678 /* 679 * kick off pageout scheduler. 680 */ 681 schedpaging(NULL); 682 683 /* 684 * Create kernel cage thread. 685 * The kernel cage thread is started under the pageout process 686 * to take advantage of the less restricted page allocation 687 * in page_create_throttle(). 688 */ 689 kcage_cageout_init(); 690 691 /* 692 * Limit pushes to avoid saturating pageout devices. 
693 */ 694 max_pushes = maxpgio / RATETOSCHEDPAGING; 695 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout"); 696 697 for (;;) { 698 mutex_enter(&push_lock); 699 700 while ((arg = push_list) == NULL || pushes > max_pushes) { 701 CALLB_CPR_SAFE_BEGIN(&cprinfo); 702 cv_wait(&push_cv, &push_lock); 703 pushes = 0; 704 CALLB_CPR_SAFE_END(&cprinfo, &push_lock); 705 } 706 push_list = arg->a_next; 707 arg->a_next = NULL; 708 mutex_exit(&push_lock); 709 710 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, 711 arg->a_len, arg->a_flags, 712 arg->a_cred) == 0) { 713 pushes++; 714 } 715 716 /* vp held by checkpage() */ 717 VN_RELE(arg->a_vp); 718 719 mutex_enter(&push_lock); 720 arg->a_next = req_freelist; /* back on freelist */ 721 req_freelist = arg; 722 push_list_size--; 723 mutex_exit(&push_lock); 724 } 725 } 726 727 /* 728 * Kernel thread that scans pages looking for ones to free 729 */ 730 static void 731 pageout_scanner(void) 732 { 733 struct page *fronthand, *backhand; 734 uint_t count; 735 callb_cpr_t cprinfo; 736 pgcnt_t nscan_limit; 737 pgcnt_t pcount; 738 739 CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); 740 mutex_enter(&pageout_mutex); 741 742 /* 743 * The restart case does not attempt to point the hands at roughly 744 * the right point on the assumption that after one circuit things 745 * will have settled down - and restarts shouldn't be that often. 746 */ 747 748 /* 749 * Set the two clock hands to be separated by a reasonable amount, 750 * but no more than 360 degrees apart. 
751 */ 752 backhand = page_first(); 753 if (handspreadpages >= total_pages) 754 fronthand = page_nextn(backhand, total_pages - 1); 755 else 756 fronthand = page_nextn(backhand, handspreadpages); 757 758 min_pageout_ticks = MAX(1, 759 ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); 760 max_pageout_ticks = MAX(min_pageout_ticks, 761 ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING); 762 763 loop: 764 cv_signal_pageout(); 765 766 CALLB_CPR_SAFE_BEGIN(&cprinfo); 767 cv_wait(&proc_pageout->p_cv, &pageout_mutex); 768 CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); 769 770 if (!dopageout) 771 goto loop; 772 773 if (reset_hands) { 774 reset_hands = 0; 775 776 backhand = page_first(); 777 if (handspreadpages >= total_pages) 778 fronthand = page_nextn(backhand, total_pages - 1); 779 else 780 fronthand = page_nextn(backhand, handspreadpages); 781 } 782 783 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); 784 count = 0; 785 786 TRACE_4(TR_FAC_VM, TR_PAGEOUT_START, 787 "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld", 788 freemem, lotsfree, nscan, desscan); 789 790 /* Kernel probe */ 791 TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */, 792 tnf_ulong, pages_free, freemem, 793 tnf_ulong, pages_needed, needfree); 794 795 pcount = 0; 796 if (pageout_sample_cnt < pageout_sample_lim) { 797 nscan_limit = total_pages; 798 } else { 799 nscan_limit = desscan; 800 } 801 pageout_lbolt = lbolt; 802 sample_start = gethrtime(); 803 804 /* 805 * Scan the appropriate number of pages for a single duty cycle. 806 * However, stop scanning as soon as there is enough free memory. 807 * For a short while, we will be sampling the performance of the 808 * scanner and need to keep running just to get sample data, in 809 * which case we keep going and don't pay attention to whether 810 * or not there is enough free memory. 
811 */ 812 813 while (nscan < nscan_limit && (freemem < lotsfree + needfree || 814 pageout_sample_cnt < pageout_sample_lim)) { 815 int rvfront, rvback; 816 817 /* 818 * Check to see if we have exceeded our %CPU budget 819 * for this wakeup, but not on every single page visited, 820 * just every once in a while. 821 */ 822 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { 823 pageout_cycle_ticks = lbolt - pageout_lbolt; 824 if (pageout_cycle_ticks >= pageout_ticks) { 825 ++pageout_timeouts; 826 break; 827 } 828 } 829 830 /* 831 * If checkpage manages to add a page to the free list, 832 * we give ourselves another couple of trips around the loop. 833 */ 834 if ((rvfront = checkpage(fronthand, FRONT)) == 1) 835 count = 0; 836 if ((rvback = checkpage(backhand, BACK)) == 1) 837 count = 0; 838 839 ++pcount; 840 841 /* 842 * protected by pageout_mutex instead of cpu_stat_lock 843 */ 844 CPU_STATS_ADDQ(CPU, vm, scan, 1); 845 846 /* 847 * Don't include ineligible pages in the number scanned. 848 */ 849 if (rvfront != -1 || rvback != -1) 850 nscan++; 851 852 backhand = page_next(backhand); 853 854 /* 855 * backhand update and wraparound check are done separately 856 * because lint barks when it finds an empty "if" body 857 */ 858 859 if ((fronthand = page_next(fronthand)) == page_first()) { 860 TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP, 861 "pageout_hand_wrap:freemem %ld whichhand %d", 862 freemem, FRONT); 863 864 /* 865 * protected by pageout_mutex instead of cpu_stat_lock 866 */ 867 CPU_STATS_ADDQ(CPU, vm, rev, 1); 868 if (++count > 1) { 869 /* 870 * Extremely unlikely, but it happens. 871 * We went around the loop at least once 872 * and didn't get far enough. 873 * If we are still skipping `highly shared' 874 * pages, skip fewer of them. Otherwise, 875 * give up till the next clock tick. 
876 */ 877 if (po_share < MAX_PO_SHARE) { 878 po_share <<= 1; 879 } else { 880 /* 881 * Really a "goto loop", but 882 * if someone is TRACing or 883 * TNF_PROBE_ing, at least 884 * make records to show 885 * where we are. 886 */ 887 break; 888 } 889 } 890 } 891 } 892 893 sample_end = gethrtime(); 894 895 TRACE_5(TR_FAC_VM, TR_PAGEOUT_END, 896 "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u", 897 freemem, lotsfree, nscan, desscan, count); 898 899 /* Kernel probe */ 900 TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, 901 tnf_ulong, pages_scanned, nscan, 902 tnf_ulong, pages_free, freemem); 903 904 if (pageout_sample_cnt < pageout_sample_lim) { 905 pageout_sample_pages += pcount; 906 pageout_sample_etime += sample_end - sample_start; 907 ++pageout_sample_cnt; 908 } 909 if (pageout_sample_cnt >= pageout_sample_lim && 910 pageout_new_spread == 0) { 911 pageout_rate = (hrrate_t)pageout_sample_pages * 912 (hrrate_t)(NANOSEC) / pageout_sample_etime; 913 pageout_new_spread = pageout_rate / 10; 914 setupclock(1); 915 } 916 917 goto loop; 918 } 919 920 /* 921 * Look at the page at hand. If it is locked (e.g., for physical i/o), 922 * system (u., page table) or free, then leave it alone. Otherwise, 923 * if we are running the front hand, turn off the page's reference bit. 924 * If the proc is over maxrss, we take it. If running the back hand, 925 * check whether the page has been reclaimed. If not, free the page, 926 * pushing it to disk first if necessary. 927 * 928 * Return values: 929 * -1 if the page is not a candidate at all, 930 * 0 if not freed, or 931 * 1 if we freed it. 932 */ 933 static int 934 checkpage(struct page *pp, int whichhand) 935 { 936 int ppattr; 937 int isfs = 0; 938 int isexec = 0; 939 int pagesync_flag; 940 941 /* 942 * Skip pages: 943 * - associated with the kernel vnode since 944 * they are always "exclusively" locked. 
945 * - that are free 946 * - that are shared more than po_share'd times 947 * - its already locked 948 * 949 * NOTE: These optimizations assume that reads are atomic. 950 */ 951 top: 952 if ((PP_ISKAS(pp)) || (PP_ISFREE(pp)) || 953 (hat_page_getshare(pp) > po_share) || PAGE_LOCKED(pp)) { 954 return (-1); 955 } 956 957 if (!page_trylock(pp, SE_EXCL)) { 958 /* 959 * Skip the page if we can't acquire the "exclusive" lock. 960 */ 961 return (-1); 962 } else if (PP_ISFREE(pp)) { 963 /* 964 * It became free between the above check and our actually 965 * locking the page. Oh, well there will be other pages. 966 */ 967 page_unlock(pp); 968 return (-1); 969 } 970 971 /* 972 * Reject pages that cannot be freed. The page_struct_lock 973 * need not be acquired to examine these 974 * fields since the page has an "exclusive" lock. 975 */ 976 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 977 page_unlock(pp); 978 return (-1); 979 } 980 981 /* 982 * Maintain statistics for what we are freeing 983 */ 984 985 if (pp->p_vnode != NULL) { 986 if (pp->p_vnode->v_flag & VVMEXEC) 987 isexec = 1; 988 989 if (!IS_SWAPFSVP(pp->p_vnode)) 990 isfs = 1; 991 } 992 993 /* 994 * Turn off REF and MOD bits with the front hand. 995 * The back hand examines the REF bit and always considers 996 * SHARED pages as referenced. 997 */ 998 if (whichhand == FRONT) 999 pagesync_flag = HAT_SYNC_ZERORM; 1000 else 1001 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF | 1002 HAT_SYNC_STOPON_SHARED; 1003 1004 ppattr = hat_pagesync(pp, pagesync_flag); 1005 1006 recheck: 1007 /* 1008 * If page is referenced; make unreferenced but reclaimable. 1009 * If this page is not referenced, then it must be reclaimable 1010 * and we can add it to the free list. 1011 */ 1012 if (ppattr & P_REF) { 1013 TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF, 1014 "pageout_isref:pp %p whichhand %d", pp, whichhand); 1015 if (whichhand == FRONT) { 1016 /* 1017 * Checking of rss or madvise flags needed here... 
1018 * 1019 * If not "well-behaved", fall through into the code 1020 * for not referenced. 1021 */ 1022 hat_clrref(pp); 1023 } 1024 /* 1025 * Somebody referenced the page since the front 1026 * hand went by, so it's not a candidate for 1027 * freeing up. 1028 */ 1029 page_unlock(pp); 1030 return (0); 1031 } 1032 1033 VM_STAT_ADD(pageoutvmstats.checkpage[0]); 1034 1035 /* 1036 * If large page, attempt to demote it. If successfully demoted, 1037 * retry the checkpage. 1038 */ 1039 if (pp->p_szc != 0) { 1040 if (!page_try_demote_pages(pp)) { 1041 VM_STAT_ADD(pageoutvmstats.checkpage[1]); 1042 page_unlock(pp); 1043 return (-1); 1044 } 1045 ASSERT(pp->p_szc == 0); 1046 VM_STAT_ADD(pageoutvmstats.checkpage[2]); 1047 /* 1048 * since page_try_demote_pages() could have unloaded some 1049 * mappings it makes sense to reload ppattr. 1050 */ 1051 ppattr = hat_page_getattr(pp, P_MOD | P_REF); 1052 } 1053 1054 /* 1055 * If the page is currently dirty, we have to arrange 1056 * to have it cleaned before it can be freed. 1057 * 1058 * XXX - ASSERT(pp->p_vnode != NULL); 1059 */ 1060 if ((ppattr & P_MOD) && pp->p_vnode) { 1061 struct vnode *vp = pp->p_vnode; 1062 u_offset_t offset = pp->p_offset; 1063 1064 /* 1065 * XXX - Test for process being swapped out or about to exit? 1066 * [Can't get back to process(es) using the page.] 1067 */ 1068 1069 /* 1070 * Hold the vnode before releasing the page lock to 1071 * prevent it from being freed and re-used by some 1072 * other thread. 1073 */ 1074 VN_HOLD(vp); 1075 page_unlock(pp); 1076 1077 /* 1078 * Queue i/o request for the pageout thread. 1079 */ 1080 if (!queue_io_request(vp, offset)) { 1081 VN_RELE(vp); 1082 return (0); 1083 } 1084 return (1); 1085 } 1086 1087 /* 1088 * Now we unload all the translations, 1089 * and put the page back on to the free list. 1090 * If the page was used (referenced or modified) after 1091 * the pagesync but before it was unloaded we catch it 1092 * and handle the page properly. 
1093 */ 1094 TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE, 1095 "pageout_free:pp %p whichhand %d", pp, whichhand); 1096 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1097 ppattr = hat_page_getattr(pp, P_MOD | P_REF); 1098 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) 1099 goto recheck; 1100 1101 /*LINTED: constant in conditional context*/ 1102 VN_DISPOSE(pp, B_FREE, 0, kcred); 1103 1104 CPU_STATS_ADD_K(vm, dfree, 1); 1105 1106 if (isfs) { 1107 if (isexec) { 1108 CPU_STATS_ADD_K(vm, execfree, 1); 1109 } else { 1110 CPU_STATS_ADD_K(vm, fsfree, 1); 1111 } 1112 } else { 1113 CPU_STATS_ADD_K(vm, anonfree, 1); 1114 } 1115 1116 return (1); /* freed a page! */ 1117 } 1118 1119 /* 1120 * Queue async i/o request from pageout_scanner and segment swapout 1121 * routines on one common list. This ensures that pageout devices (swap) 1122 * are not saturated by pageout_scanner or swapout requests. 1123 * The pageout thread empties this list by initiating i/o operations. 1124 */ 1125 int 1126 queue_io_request(vnode_t *vp, u_offset_t off) 1127 { 1128 struct async_reqs *arg; 1129 1130 /* 1131 * If we cannot allocate an async request struct, 1132 * skip this page. 1133 */ 1134 mutex_enter(&push_lock); 1135 if ((arg = req_freelist) == NULL) { 1136 mutex_exit(&push_lock); 1137 return (0); 1138 } 1139 req_freelist = arg->a_next; /* adjust freelist */ 1140 push_list_size++; 1141 1142 arg->a_vp = vp; 1143 arg->a_off = off; 1144 arg->a_len = PAGESIZE; 1145 arg->a_flags = B_ASYNC | B_FREE; 1146 arg->a_cred = kcred; /* always held */ 1147 1148 /* 1149 * Add to list of pending write requests. 1150 */ 1151 arg->a_next = push_list; 1152 push_list = arg; 1153 1154 if (req_freelist == NULL) { 1155 /* 1156 * No free async requests left. The lock is held so we 1157 * might as well signal the pusher thread now. 1158 */ 1159 cv_signal(&push_cv); 1160 } 1161 mutex_exit(&push_lock); 1162 return (1); 1163 } 1164 1165 /* 1166 * Wakeup pageout to initiate i/o if push_list is not empty. 
1167 */ 1168 void 1169 cv_signal_pageout() 1170 { 1171 if (push_list != NULL) { 1172 mutex_enter(&push_lock); 1173 cv_signal(&push_cv); 1174 mutex_exit(&push_lock); 1175 } 1176 } 1177