/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

static int checkpage(page_t *, int);

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time
 * based on the size of the system.  If they are patched non-zero in
 * a loaded vmunix they are left alone and may thus be changed per system
 * using adb on the loaded system.
 */
pgcnt_t		slowscan = 0;
pgcnt_t		fastscan = 0;

static pgcnt_t	handspreadpages = 0;
static int	loopfraction = 2;
static pgcnt_t	looppages;
static int	min_percent_cpu = 4;
static int	max_percent_cpu = 80;
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;

pgcnt_t	maxpgio = 0;
pgcnt_t	minfree = 0;
pgcnt_t	desfree = 0;
pgcnt_t	lotsfree = 0;
pgcnt_t	needfree = 0;
pgcnt_t	throttlefree = 0;
pgcnt_t	pageout_reserve = 0;

pgcnt_t	deficit;
pgcnt_t	nscan;
pgcnt_t	desscan;

/*
 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 * are the number of ticks in each wakeup cycle that gives the
 * equivalent of some underlying %CPU duty cycle.
 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 * So, for example, 4% == 1 tick and 80% == 20 ticks.
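 *
 * For illustration (assuming the common hz == 100 configuration; the
 * actual computation, including the clamp to at least one tick, is
 * done in pageout_scanner() below):
 *
 *	min_pageout_ticks = ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING
 *			  = ((100 * 4) / 100) / 4  = 1 tick
 *	max_pageout_ticks = ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING
 *			  = ((100 * 80) / 100) / 4 = 20 ticks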
 *
 * min_pageout_ticks:
 *	ticks/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_ticks:
 *	ticks/wakeup equivalent of max_percent_cpu.
 *
 * pageout_ticks:
 *	Number of clock ticks budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_ticks .. max_pageout_ticks,
 *	depending on memory pressure.
 *
 * pageout_lbolt:
 *	Timestamp of the last time pageout_scanner woke up and started
 *	(or resumed) scanning for not recently referenced pages.
 */

static clock_t	min_pageout_ticks;
static clock_t	max_pageout_ticks;
static clock_t	pageout_ticks;
static clock_t	pageout_lbolt;

static uint_t	reset_hands;

#define	PAGES_POLL_MASK	1023

/*
 * pageout_sample_lim:
 *	The limit on the number of samples needed to establish a value
 *	for the new pageout parameters: fastscan, slowscan, and
 *	handspreadpages.
 *
 * pageout_sample_cnt:
 *	Current sample number.  Once the sample gets large enough,
 *	set new values for handspreadpages, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *	The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *	The accumulated nanoseconds of scanning time for the sample.
 *
 * pageout_rate:
 *	Rate in pages/nanosecond, computed at the end of sampling.
 *
 * pageout_new_spread:
 *	The new value to use for fastscan and handspreadpages.
 *	Calculated after enough samples have been taken.
 */

typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

static clock_t	pageout_cycle_ticks;
static hrtime_t	sample_start, sample_end;
static hrtime_t	pageout_sample_etime = 0;

/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t memavail_lock;
kcondvar_t memavail_cv;

/*
 * The size of the clock loop.
 */
#define	LOOPPAGES	total_pages

/*
 * Set up the paging constants for the clock algorithm.
 * Called after the system is initialized and the amount of memory
 * and number of paging devices is known.
 *
 * lotsfree is 1/64 of memory, but at least 512K.
 * desfree is 1/2 of lotsfree.
 * minfree is 1/2 of desfree.
 *
 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
 *
 *	lotsfree = btop(512K)
 *	desfree = btop(200K)
 *	minfree = btop(100K)
 *	throttlefree = INT_MIN
 *	max_percent_cpu = 4
 */
void
setupclock(int recalc)
{

	static spgcnt_t init_lfree, init_dfree, init_mfree;
	static spgcnt_t init_tfree, init_preserve, init_mpgio;
	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;

	looppages = LOOPPAGES;

	/*
	 * setupclock can now be called to recalculate the paging
	 * parameters in the case of dynamic addition of memory.
	 * So to make sure we make the proper calculations, if such a
	 * situation should arise, we save away the initial values
	 * of each parameter so we can recall them when needed.  This
	 * way we don't lose the settings an admin might have made
	 * through the /etc/system file.
	 */

	if (!recalc) {
		init_lfree = lotsfree;
		init_dfree = desfree;
		init_mfree = minfree;
		init_tfree = throttlefree;
		init_preserve = pageout_reserve;
		init_mpgio = maxpgio;
		init_mfscan = maxfastscan;
		init_fscan = fastscan;
		init_sscan = slowscan;
		init_hspages = handspreadpages;
	}

	/*
	 * Set up thresholds for paging:
	 */

	/*
	 * Lotsfree is the threshold at which the paging daemon turns on.
	 */
	if (init_lfree == 0 || init_lfree >= looppages)
		lotsfree = MAX(looppages / 64, btop(512 * 1024));
	else
		lotsfree = init_lfree;

	/*
	 * Desfree is the amount of memory desired free.
	 * If less than this for an extended period, start swapping.
	 */
	if (init_dfree == 0 || init_dfree >= lotsfree)
		desfree = lotsfree / 2;
	else
		desfree = init_dfree;

	/*
	 * Minfree is the minimal amount of free memory which is tolerable.
	 */
	if (init_mfree == 0 || init_mfree >= desfree)
		minfree = desfree / 2;
	else
		minfree = init_mfree;

	/*
	 * Throttlefree is the point at which we start throttling
	 * PG_WAIT requests until enough memory becomes available.
	 */
	if (init_tfree == 0 || init_tfree >= desfree)
		throttlefree = minfree;
	else
		throttlefree = init_tfree;

	/*
	 * Pageout_reserve is the number of pages that we keep in
	 * stock for pageout's own use.  Having a few such pages
	 * provides insurance against system deadlock due to
	 * pageout needing pages.  When freemem < pageout_reserve,
	 * non-blocking allocations are denied to any threads
	 * other than pageout and sched.  (At some point we might
	 * want to consider a per-thread flag like T_PUSHING_PAGES
	 * to indicate that a thread is part of the page-pushing
	 * dance (e.g. an interrupt thread) and thus is entitled
	 * to the same special dispensation we accord pageout.)
	 */
	if (init_preserve == 0 || init_preserve >= throttlefree)
		pageout_reserve = throttlefree / 2;
	else
		pageout_reserve = init_preserve;

	/*
	 * Maxpgio limits how much paging is acceptable.
	 * This figures that 2/3 busy on a disk arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (init_mpgio == 0)
		maxpgio = (DISKRPM * 2) / 3;
	else
		maxpgio = init_mpgio;

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  The fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, MHz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 MHz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 MHz SC2000:		68 Meg
	 *
	 *	40 MHz 486:		26 Meg
	 *	66 MHz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 MHz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPUs can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (init_mfscan == 0) {
		if (pageout_new_spread != 0)
			maxfastscan = pageout_new_spread;
		else
			maxfastscan = MAXHANDSPREADPAGES;
	} else {
		maxfastscan = init_mfscan;
	}
	if (init_fscan == 0)
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	else
		fastscan = init_fscan;
	if (fastscan > looppages / loopfraction)
		fastscan = looppages / loopfraction;

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (init_sscan == 0)
		slowscan = MIN(fastscan / 10, maxslowscan);
	else
		slowscan = init_sscan;
	if (slowscan > fastscan / 2)
		slowscan = fastscan / 2;

	/*
	 * Handspreadpages is the distance (in pages) between the front
	 * and back pageout daemon hands.  The amount of time to reclaim
	 * a page once pageout examines it increases with this distance
	 * and decreases as the scan rate rises.  It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (init_hspages == 0)
		handspreadpages = fastscan;
	else
		handspreadpages = init_hspages;

	/*
	 * Make sure that the back hand follows the front hand by at least
	 * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
	 * for the back hand to look at a page during the same wakeup of
	 * the pageout daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages)
		handspreadpages = looppages - 1;

	/*
	 * If we have been called to recalculate the parameters,
	 * set a flag to re-evaluate the clock hand pointers.
	 */
	if (recalc)
		reset_hands = 1;
}

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan RATETOSCHEDPAGING
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging sets this value based on the amount of
 * currently available memory.
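 *
 * A sketch of the desscan calculation done below, once the initial
 * sampling phase is over (the names are the variables used in
 * schedpaging() itself):
 *
 *	vavail  = MIN(MAX(freemem - deficit - needfree, 0), lotsfree)
 *	desscan = (slowscan * vavail + fastscan * (lotsfree - vavail)) /
 *		    lotsfree / RATETOSCHEDPAGING
 *
 * so each wakeup examines roughly slowscan/RATETOSCHEDPAGING pages
 * when memory is plentiful and approaches fastscan/RATETOSCHEDPAGING
 * pages as free memory runs out.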
 */

#define	RATETOSCHEDPAGING	4		/* hz that is */

static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

static int async_list_size = 256;	/* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone - don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;

/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		/* pageout() not running */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		/*
		 * Fix for 1161438 (CRS SPR# 73922).  All variables
		 * in the original calculation for desscan were 32 bit signed
		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
		 * more of memory, the calculation can overflow.  When this
		 * happens, desscan becomes negative and pageout_scanner()
		 * stops paging out.
		 */
		if ((needfree) && (pageout_new_spread == 0)) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / RATETOSCHEDPAGING;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / RATETOSCHEDPAGING;
			desscan = (pgcnt_t)result;
		}

		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);

		if (freemem < lotsfree + needfree ||
		    pageout_sample_cnt < pageout_sample_lim) {
			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
			    "pageout_cv_signal:freemem %ld", freemem);
			cv_signal(&proc_pageout->p_cv);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner thread.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE) {
				po_share >>= 1;
			}
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast,
	 * but in this case it is not needed - the waiters will be woken up
	 * during the next invocation of this function.
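	 *
	 * For reference, a waiter on this cv is expected to follow the
	 * usual pattern (this is only a sketch; the actual consumers live
	 * in the page allocation throttling code):
	 *
	 *	mutex_enter(&memavail_lock);
	 *	while (not enough memory is free)
	 *		cv_wait(&memavail_cv, &memavail_lock);
	 *	mutex_exit(&memavail_lock);
	 *
	 * A missed broadcast is therefore only a delay, not a hang, since
	 * schedpaging() runs again RATETOSCHEDPAGING times a second.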
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
}

pgcnt_t		pushes;
ulong_t		push_list_size;		/* # of requests on pageout queue */

#define	FRONT	1
#define	BACK	2

int dopageout = 1;	/* must be non-zero to turn page stealing on */

/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run.  When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in schedpaging().  Pageout manages
 * two hands on the clock.  The front hand moves through
 * memory, clearing the reference bit,
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed.  If modified, they are pushed to
 * swap before being freed.
 *
 * There are two threads that act on behalf of the pageout process.
 * One thread scans pages (pageout_scanner) and frees them up if
 * they don't require any VOP_PUTPAGE operation.  If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled.  The pageout thread
 * grabs VOP_PUTPAGE requests from the list, and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the scanner thread can still operate.  There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * For now, this thing is in very rough form.
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Initialize the locks used by the pageout and scanner threads.
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++)
		push_req[i].a_next = &push_req[i + 1];

	pageout_pri = curthread->t_pri;

	/* Create the pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
	    pageout_pri - 1);

	/*
	 * Kick off the pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create the kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
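	 *
	 * For example, if maxpgio came out to 40 in setupclock(), then
	 * max_pushes below would be 10, i.e. roughly ten putpage pushes
	 * are issued per schedpaging() wakeup before this thread waits
	 * to be signalled again.  (The figure 40 is purely illustrative.)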
	 */
	max_pushes = maxpgio / RATETOSCHEDPAGING;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		mutex_exit(&push_lock);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

/*
 * Kernel thread that scans pages looking for ones to free.
 */
static void
pageout_scanner(void)
{
	struct page *fronthand, *backhand;
	uint_t count;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_limit;
	pgcnt_t	pcount;

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down - and restarts shouldn't be that often.
	 */

	/*
	 * Set the two clock hands to be separated by a reasonable amount,
	 * but no more than 360 degrees apart.
	 */
	backhand = page_first();
	if (handspreadpages >= total_pages)
		fronthand = page_nextn(backhand, total_pages - 1);
	else
		fronthand = page_nextn(backhand, handspreadpages);

	min_pageout_ticks = MAX(1,
	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
	max_pageout_ticks = MAX(min_pageout_ticks,
	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);

loop:
	cv_signal_pageout();

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

	if (!dopageout)
		goto loop;

	if (reset_hands) {
		reset_hands = 0;

		backhand = page_first();
		if (handspreadpages >= total_pages)
			fronthand = page_nextn(backhand, total_pages - 1);
		else
			fronthand = page_nextn(backhand, handspreadpages);
	}

	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
	count = 0;

	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
	    freemem, lotsfree, nscan, desscan);

	/* Kernel probe */
	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
	    tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);

	pcount = 0;
	if (pageout_sample_cnt < pageout_sample_lim) {
		nscan_limit = total_pages;
	} else {
		nscan_limit = desscan;
	}
	pageout_lbolt = ddi_get_lbolt();
	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 * However, stop scanning as soon as there is enough free memory.
	 * For a short while, we will be sampling the performance of the
	 * scanner and need to keep running just to get sample data, in
	 * which case we keep going and don't pay attention to whether
	 * or not there is enough free memory.
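	 *
	 * The %CPU budget computed by schedpaging() is enforced below by
	 * comparing elapsed lbolt ticks against pageout_ticks, but only
	 * once every PAGES_POLL_MASK + 1 (1024) pages visited, so that
	 * the check itself stays cheap.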
	 */

	while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
	    pageout_sample_cnt < pageout_sample_lim)) {
		int rvfront, rvback;

		/*
		 * Check to see if we have exceeded our %CPU budget
		 * for this wakeup, but not on every single page visited,
		 * just every once in a while.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
			if (pageout_cycle_ticks >= pageout_ticks) {
				++pageout_timeouts;
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fronthand, FRONT)) == 1)
			count = 0;
		if ((rvback = checkpage(backhand, BACK)) == 1)
			count = 0;

		++pcount;

		/*
		 * protected by pageout_mutex instead of cpu_stat_lock
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != -1 || rvback != -1)
			nscan++;

		backhand = page_next(backhand);

		/*
		 * backhand update and wraparound check are done separately
		 * because lint barks when it finds an empty "if" body
		 */

		if ((fronthand = page_next(fronthand)) == page_first()) {
			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
			    "pageout_hand_wrap:freemem %ld whichhand %d",
			    freemem, FRONT);

			/*
			 * protected by pageout_mutex instead of cpu_stat_lock
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);
			if (++count > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					/*
					 * Really a "goto loop", but
					 * if someone is TRACing or
					 * TNF_PROBE_ing, at least
					 * make records to show
					 * where we are.
					 */
					break;
				}
			}
		}
	}

	sample_end = gethrtime();

	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
	    freemem, lotsfree, nscan, desscan, count);

	/* Kernel probe */
	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
	    tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);

	if (pageout_sample_cnt < pageout_sample_lim) {
		pageout_sample_pages += pcount;
		pageout_sample_etime += sample_end - sample_start;
		++pageout_sample_cnt;
	}
	if (pageout_sample_cnt >= pageout_sample_lim &&
	    pageout_new_spread == 0) {
		pageout_rate = (hrrate_t)pageout_sample_pages *
		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
		pageout_new_spread = pageout_rate / 10;
		setupclock(1);
	}

	goto loop;
}

/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * system (u., page table) or free, then leave it alone.  Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it.  If running the back hand,
 * check whether the page has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *	-1 if the page is not a candidate at all,
 *	 0 if not freed, or
 *	 1 if we freed it.
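 *
 * The caller, pageout_scanner(), only counts a visit towards nscan
 * when the return value is not -1, and treats a return of 1 as a
 * reason to reset its wraparound ("no progress") counter.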
 */
static int
checkpage(struct page *pp, int whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode since
	 *	    they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share times
	 *	- that are already locked
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */

	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (-1);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (-1);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (-1);
	}

	/*
	 * Reject pages that cannot be freed.  The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (-1);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */

	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == FRONT)
		pagesync_flag = HAT_SYNC_ZERORM;
	else
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If the page is referenced, make it unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
		    "pageout_isref:pp %p whichhand %d", pp, whichhand);
		if (whichhand == FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}
		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (0);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If this is a large page, attempt to demote it.  If successfully
	 * demoted, retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (-1);
		}
		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings, it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange
	 * to have it cleaned before it can be freed.
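	 * "Arrange" here means handing the <vnode, offset> pair to
	 * queue_io_request(), which queues a B_ASYNC | B_FREE putpage
	 * for the pageout thread to issue; the page is then freed as
	 * part of that i/o completing rather than here.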
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue i/o request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (0);
		}
		return (1);
	}

	/*
	 * Now we unload all the translations
	 * and put the page back onto the free list.
	 * If the page was used (referenced or modified) after
	 * the pagesync but before it was unloaded we catch it
	 * and handle the page properly.
	 */
	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
	    "pageout_free:pp %p whichhand %d", pp, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
		goto recheck;

	/*LINTED: constant in conditional context*/
	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (1);		/* freed a page! */
}

/*
 * Queue async i/o requests from pageout_scanner and the segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;		/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to the list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left.  The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wake up pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}