1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2021 Oxide Computer Company 24 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 25 */ 26 27 /* 28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 29 * Use is subject to license terms. 30 */ 31 32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 33 /* All Rights Reserved */ 34 35 /* 36 * University Copyright- Copyright (c) 1982, 1986, 1988 37 * The Regents of the University of California 38 * All Rights Reserved 39 * 40 * University Acknowledgment- Portions of this document are derived from 41 * software developed by the University of California, Berkeley, and its 42 * contributors. 
43 */ 44 45 #include <sys/types.h> 46 #include <sys/t_lock.h> 47 #include <sys/param.h> 48 #include <sys/buf.h> 49 #include <sys/uio.h> 50 #include <sys/proc.h> 51 #include <sys/systm.h> 52 #include <sys/mman.h> 53 #include <sys/cred.h> 54 #include <sys/vnode.h> 55 #include <sys/vm.h> 56 #include <sys/vmparam.h> 57 #include <sys/vtrace.h> 58 #include <sys/cmn_err.h> 59 #include <sys/cpuvar.h> 60 #include <sys/user.h> 61 #include <sys/kmem.h> 62 #include <sys/debug.h> 63 #include <sys/callb.h> 64 #include <sys/tnf_probe.h> 65 #include <sys/mem_cage.h> 66 #include <sys/time.h> 67 #include <sys/stdbool.h> 68 69 #include <vm/hat.h> 70 #include <vm/as.h> 71 #include <vm/seg.h> 72 #include <vm/page.h> 73 #include <vm/pvn.h> 74 #include <vm/seg_kmem.h> 75 76 /* 77 * FREE MEMORY MANAGEMENT 78 * 79 * Management of the pool of free pages is a tricky business. There are 80 * several critical threshold values which constrain our allocation of new 81 * pages and inform the rate of paging out of memory to swap. These threshold 82 * values, and the behaviour they induce, are described below in descending 83 * order of size -- and thus increasing order of severity! 84 * 85 * +---------------------------------------------------- physmem (all memory) 86 * | 87 * | Ordinarily there are no particular constraints placed on page 88 * v allocation. The page scanner is not running and page_create_va() 89 * | will effectively grant all page requests (whether from the kernel 90 * | or from user processes) without artificial delay. 91 * | 92 * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB) 93 * | 94 * | When we have less than "lotsfree" pages, pageout_scanner() is 95 * v signalled by schedpaging() to begin looking for pages that can 96 * | be evicted to disk to bring us back above lotsfree. At this 97 * | stage there is still no constraint on allocation of free pages. 
98 * | 99 * | For small systems, we set a lower bound of 16MB for lotsfree; 100 * v this is the natural value for a system with 1GB memory. This is 101 * | to ensure that the pageout reserve pool contains at least 4MB 102 * | for use by ZFS. 103 * | 104 * | For systems with a large amount of memory, we constrain lotsfree 105 * | to be at most 2GB (with a pageout reserve of around 0.5GB), as 106 * v at some point the required slack relates more closely to the 107 * | rate at which paging can occur than to the total amount of memory. 108 * | 109 * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB) 110 * | 111 * | When we drop below desfree, a number of kernel facilities will 112 * v wait before allocating more memory, under the assumption that 113 * | pageout or reaping will make progress and free up some memory. 114 * | This behaviour is not especially coordinated; look for comparisons 115 * | of desfree and freemem. 116 * | 117 * | In addition to various attempts at advisory caution, clock() 118 * | will wake up the thread that is ordinarily parked in sched(). 119 * | This routine is responsible for the heavy-handed swapping out 120 * v of entire processes in an attempt to arrest the slide of free 121 * | memory. See comments in sched.c for more details. 122 * | 123 * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB) 124 * | 125 * | These two separate tunables have, by default, the same value. 126 * v Various parts of the kernel use minfree to signal the need for 127 * | more aggressive reclamation of memory, and sched() is more 128 * | aggressive at swapping processes out. 129 * | 130 * | If free memory falls below throttlefree, page_create_va() will 131 * | use page_create_throttle() to begin holding most requests for 132 * | new pages while pageout and reaping free up memory. Sleeping 133 * v allocations (e.g., KM_SLEEP) are held here while we wait for 134 * | more memory. 
Non-sleeping allocations are generally allowed to 135 * | proceed, unless their priority is explicitly lowered with 136 * | KM_NORMALPRI. 137 * | 138 * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB) 139 * | 140 * | When we hit throttlefree, the situation is already dire. The 141 * v system is generally paging out memory and swapping out entire 142 * | processes in order to free up memory for continued operation. 143 * | 144 * | Unfortunately, evicting memory to disk generally requires short 145 * | term use of additional memory; e.g., allocation of buffers for 146 * | storage drivers, updating maps of free and used blocks, etc. 147 * | As such, pageout_reserve is the number of pages that we keep in 148 * | special reserve for use by pageout() and sched() and by any 149 * v other parts of the kernel that need to be working for those to 150 * | make forward progress such as the ZFS I/O pipeline. 151 * | 152 * | When we are below pageout_reserve, we fail or hold any allocation 153 * | that has not explicitly requested access to the reserve pool. 154 * | Access to the reserve is generally granted via the KM_PUSHPAGE 155 * | flag, or by marking a thread T_PUSHPAGE such that all allocations 156 * | can implicitly tap the reserve. For more details, see the 157 * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE 158 * | and VM_PUSHPAGE allocation flags, and page_create_throttle(). 159 * | 160 * +---------------------------------------------------------- no free memory 161 * | 162 * | If we have arrived here, things are very bad indeed. It is 163 * v surprisingly difficult to tell if this condition is even fatal, 164 * | as enough memory may have been granted to pageout() and to the 165 * | ZFS I/O pipeline that requests for eviction that have already been 166 * | made will complete and free up memory some time soon. 167 * | 168 * | If free memory does not materialise, the system generally remains 169 * | deadlocked. 
 *  |	The pageout_deadman() below is run once per second
 *  |	from clock(), seeking to limit the amount of time a single request
 *  v	to page out can be blocked before the system panics to get a crash
 *  |	dump and return to service.
 *  |
 *  +-------------------------------------------------------------------------
 */

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock().  If they are patched non-zero
 * in a loaded vmunix they are left alone and may thus be changed per system
 * using "mdb -kw" on the loaded system.
 */
pgcnt_t		slowscan = 0;
pgcnt_t		fastscan = 0;

static pgcnt_t	handspreadpages = 0;

/*
 * looppages:
 *     Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *     Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t	loopfraction = 2;
static pgcnt_t	looppages;

/*
 * min_percent_cpu, max_percent_cpu:
 *     Bounds on the scanner duty cycle, expressed as a percentage (0-100).
 *     pageout_scanner() converts them into min_pageout_nsec and
 *     max_pageout_nsec.
 *
 * maxfastscan:
 *     Upper bound applied to the automatically sized fastscan.  When left at
 *     zero, setupclock() uses pageout_new_spread once it has been measured,
 *     and MAXHANDSPREADPAGES before that.
 *
 * maxslowscan:
 *     Upper bound applied to the automatically sized slowscan.
 */
static uint_t	min_percent_cpu = 4;
static uint_t	max_percent_cpu = 80;
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;

/* Number of bytes in a megabyte; used to express the defaults below. */
#define		MEGABYTES		(1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *     set to 1 to use the previous default threshold size calculation;
 *     i.e., each threshold is half of the next largest value.
 */
uint_t		pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 * built-in default minimum and maximum (below) are written in bytes and
 * converted to pages with btop(); a zero value for either tunable means that
 * default is used.  NOTE(review): setupclock() compares an operator-supplied
 * lotsfree_min or lotsfree_max directly against page counts and uses it
 * unconverted, i.e. as a number of pages rather than bytes -- confirm which
 * unit is intended.
 */
uint_t		lotsfree_fraction = 64;
pgcnt_t		lotsfree_min = 0;
pgcnt_t		lotsfree_max = 0;

#define		LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
#define		LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)

/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is smaller than the threshold above it (tune() falls back to the
 * computed default when the value is greater than or equal to that
 * threshold), the specified value will be used directly without any
 * additional calculation or adjustment.  The boot time value of these
 * overrides is preserved in the "clockinit" struct.  More detail is available
 * in the comment at the top of the file.
 */
pgcnt_t		maxpgio = 0;
pgcnt_t		minfree = 0;
pgcnt_t		desfree = 0;
pgcnt_t		lotsfree = 0;
pgcnt_t		needfree = 0;
pgcnt_t		throttlefree = 0;
pgcnt_t		pageout_reserve = 0;

/*
 * deficit:
 *     Subtracted from freemem by schedpaging() when it estimates how much
 *     memory is available.
 *
 * nscan:
 *     Number of pages examined in the current pass; schedpaging() resets it
 *     to zero each time it runs.
 *
 * desscan:
 *     Number of pages that should be examined in the next pass; computed by
 *     schedpaging().
 */
pgcnt_t		deficit;
pgcnt_t		nscan;
pgcnt_t		desscan;

/*
 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 * number of nanoseconds in each wakeup cycle that gives the equivalent of some
 * underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *     nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *     nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *     Number of nanoseconds budgeted for each wakeup cycle.
 *     Computed each time around by schedpaging().
 *     Varies between min_pageout_nsec and max_pageout_nsec,
 *     depending on memory pressure.
 */
static hrtime_t	min_pageout_nsec;
static hrtime_t	max_pageout_nsec;
static hrtime_t	pageout_nsec;

/*
 * Set by setupclock() when it recalculates the paging parameters;
 * pageout_scanner() clears it and re-positions both clock hands.
 */
static uint_t	reset_hands;

/*
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably masks the visited-page count so that the scanner only checks its
 * time budget periodically -- confirm against pageout_scanner().
 */
#define	PAGES_POLL_MASK	1023

/*
 * pageout_sample_lim:
 *     The limit on the number of samples needed to establish a value for new
 *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *     handspreadpages.
 *
 * pageout_sample_cnt:
 *     Current sample number.  Once the sample gets large enough, set new
 *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *     The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *     The accumulated nanoseconds for the sample.
 *
 * pageout_rate:
 *     Rate in pages/nanosecond, computed at the end of sampling.
 *
 * pageout_new_spread:
 *     Initially zero while the system scan rate is measured by
 *     pageout_scanner(), which then sets this value once per system boot after
 *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 *     new value is used for fastscan and handspreadpages.
 *
 * sample_start, sample_end:
 *     The hrtime at which the last pageout_scanner() sample began and ended.
 */
typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

static hrtime_t	pageout_cycle_nsec;
static hrtime_t	sample_start, sample_end;
static hrtime_t	pageout_sample_etime = 0;

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif	/* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
329 */ 330 kmutex_t memavail_lock; 331 kcondvar_t memavail_cv; 332 333 typedef enum pageout_hand { 334 POH_FRONT = 1, 335 POH_BACK, 336 } pageout_hand_t; 337 338 typedef enum { 339 CKP_INELIGIBLE, 340 CKP_NOT_FREED, 341 CKP_FREED, 342 } checkpage_result_t; 343 344 static checkpage_result_t checkpage(page_t *, pageout_hand_t); 345 346 static struct clockinit { 347 bool ci_init; 348 pgcnt_t ci_lotsfree_min; 349 pgcnt_t ci_lotsfree_max; 350 pgcnt_t ci_lotsfree; 351 pgcnt_t ci_desfree; 352 pgcnt_t ci_minfree; 353 pgcnt_t ci_throttlefree; 354 pgcnt_t ci_pageout_reserve; 355 pgcnt_t ci_maxpgio; 356 pgcnt_t ci_maxfastscan; 357 pgcnt_t ci_fastscan; 358 pgcnt_t ci_slowscan; 359 pgcnt_t ci_handspreadpages; 360 } clockinit = { .ci_init = false }; 361 362 static pgcnt_t 363 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum) 364 { 365 if (value < minimum) { 366 return (minimum); 367 } else if (value > maximum) { 368 return (maximum); 369 } else { 370 return (value); 371 } 372 } 373 374 static pgcnt_t 375 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) 376 { 377 if (initval == 0 || initval >= initval_ceiling) { 378 return (defval); 379 } else { 380 return (initval); 381 } 382 } 383 384 /* 385 * Set up the paging constants for the clock algorithm used by 386 * pageout_scanner(), and by the virtual memory system overall. See the 387 * comments at the top of this file for more information about the threshold 388 * values and system responses to memory pressure. 389 * 390 * This routine is called once by main() at startup, after the initial size of 391 * physical memory is determined. It may be called again later if memory is 392 * added to or removed from the system, or if new measurements of the page scan 393 * rate become available. 
394 */ 395 void 396 setupclock(void) 397 { 398 pgcnt_t defval; 399 bool half = (pageout_threshold_style == 1); 400 bool recalc = true; 401 402 looppages = total_pages; 403 404 /* 405 * The operator may have provided specific values for some of the 406 * tunables via /etc/system. On our first call, we preserve those 407 * values so that they can be used for subsequent recalculations. 408 * 409 * A value of zero for any tunable means we will use the default 410 * sizing. 411 */ 412 if (!clockinit.ci_init) { 413 clockinit.ci_init = true; 414 415 clockinit.ci_lotsfree_min = lotsfree_min; 416 clockinit.ci_lotsfree_max = lotsfree_max; 417 clockinit.ci_lotsfree = lotsfree; 418 clockinit.ci_desfree = desfree; 419 clockinit.ci_minfree = minfree; 420 clockinit.ci_throttlefree = throttlefree; 421 clockinit.ci_pageout_reserve = pageout_reserve; 422 clockinit.ci_maxpgio = maxpgio; 423 clockinit.ci_maxfastscan = maxfastscan; 424 clockinit.ci_fastscan = fastscan; 425 clockinit.ci_slowscan = slowscan; 426 clockinit.ci_handspreadpages = handspreadpages; 427 428 /* 429 * The first call does not trigger a recalculation, only 430 * subsequent calls. 431 */ 432 recalc = false; 433 } 434 435 /* 436 * Configure paging threshold values. For more details on what each 437 * threshold signifies, see the comments at the top of this file. 438 */ 439 lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages, 440 btop(LOTSFREE_MAX_DEFAULT)); 441 lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max, 442 btop(LOTSFREE_MIN_DEFAULT)); 443 444 lotsfree = tune(clockinit.ci_lotsfree, looppages, 445 clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max)); 446 447 desfree = tune(clockinit.ci_desfree, lotsfree, 448 lotsfree / 2); 449 450 minfree = tune(clockinit.ci_minfree, desfree, 451 half ? desfree / 2 : 3 * desfree / 4); 452 453 throttlefree = tune(clockinit.ci_throttlefree, desfree, 454 minfree); 455 456 pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree, 457 half ? 
throttlefree / 2 : 3 * throttlefree / 4); 458 459 /* 460 * Maxpgio thresholds how much paging is acceptable. 461 * This figures that 2/3 busy on an arm is all that is 462 * tolerable for paging. We assume one operation per disk rev. 463 * 464 * XXX - Does not account for multiple swap devices. 465 */ 466 if (clockinit.ci_maxpgio == 0) { 467 maxpgio = (DISKRPM * 2) / 3; 468 } else { 469 maxpgio = clockinit.ci_maxpgio; 470 } 471 472 /* 473 * The clock scan rate varies between fastscan and slowscan 474 * based on the amount of free memory available. Fastscan 475 * rate should be set based on the number pages that can be 476 * scanned per sec using ~10% of processor time. Since this 477 * value depends on the processor, MMU, Mhz etc., it is 478 * difficult to determine it in a generic manner for all 479 * architectures. 480 * 481 * Instead of trying to determine the number of pages scanned 482 * per sec for every processor, fastscan is set to be the smaller 483 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling 484 * time is limited to ~4% of processor time. 485 * 486 * Setting fastscan to be 1/2 of memory allows pageout to scan 487 * all of memory in ~2 secs. This implies that user pages not 488 * accessed within 1 sec (assuming, handspreadpages == fastscan) 489 * can be reclaimed when free memory is very low. Stealing pages 490 * not accessed within 1 sec seems reasonable and ensures that 491 * active user processes don't thrash. 492 * 493 * Smaller values of fastscan result in scanning fewer pages 494 * every second and consequently pageout may not be able to free 495 * sufficient memory to maintain the minimum threshold. Larger 496 * values of fastscan result in scanning a lot more pages which 497 * could lead to thrashing and higher CPU usage. 
498 * 499 * Fastscan needs to be limited to a maximum value and should not 500 * scale with memory to prevent pageout from consuming too much 501 * time for scanning on slow CPU's and avoid thrashing, as a 502 * result of scanning too many pages, on faster CPU's. 503 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES 504 * (the upper bound for fastscan) based on the average number 505 * of pages that can potentially be scanned in ~1 sec (using ~4% 506 * of the CPU) on some of the following machines that currently 507 * run Solaris 2.x: 508 * 509 * average memory scanned in ~1 sec 510 * 511 * 25 Mhz SS1+: 23 Meg 512 * LX: 37 Meg 513 * 50 Mhz SC2000: 68 Meg 514 * 515 * 40 Mhz 486: 26 Meg 516 * 66 Mhz 486: 42 Meg 517 * 518 * When free memory falls just below lotsfree, the scan rate 519 * goes from 0 to slowscan (i.e., pageout starts running). This 520 * transition needs to be smooth and is achieved by ensuring that 521 * pageout scans a small number of pages to satisfy the transient 522 * memory demand. This is set to not exceed 100 pages/sec (25 per 523 * wakeup) since scanning that many pages has no noticible impact 524 * on system performance. 525 * 526 * In addition to setting fastscan and slowscan, pageout is 527 * limited to using ~4% of the CPU. This results in increasing 528 * the time taken to scan all of memory, which in turn means that 529 * user processes have a better opportunity of preventing their 530 * pages from being stolen. This has a positive effect on 531 * interactive and overall system performance when memory demand 532 * is high. 533 * 534 * Thus, the rate at which pages are scanned for replacement will 535 * vary linearly between slowscan and the number of pages that 536 * can be scanned using ~4% of processor time instead of varying 537 * linearly between slowscan and fastscan. 
538 * 539 * Also, the processor time used by pageout will vary from ~1% 540 * at slowscan to ~4% at fastscan instead of varying between 541 * ~1% at slowscan and ~10% at fastscan. 542 * 543 * The values chosen for the various VM parameters (fastscan, 544 * handspreadpages, etc) are not universally true for all machines, 545 * but appear to be a good rule of thumb for the machines we've 546 * tested. They have the following ranges: 547 * 548 * cpu speed: 20 to 70 Mhz 549 * page size: 4K to 8K 550 * memory size: 16M to 5G 551 * page scan rate: 4000 - 17400 4K pages per sec 552 * 553 * The values need to be re-examined for machines which don't 554 * fall into the various ranges (e.g., slower or faster CPUs, 555 * smaller or larger pagesizes etc) shown above. 556 * 557 * On an MP machine, pageout is often unable to maintain the 558 * minimum paging thresholds under heavy load. This is due to 559 * the fact that user processes running on other CPU's can be 560 * dirtying memory at a much faster pace than pageout can find 561 * pages to free. The memory demands could be met by enabling 562 * more than one CPU to run the clock algorithm in such a manner 563 * that the various clock hands don't overlap. This also makes 564 * it more difficult to determine the values for fastscan, slowscan 565 * and handspreadpages. 566 * 567 * The swapper is currently used to free up memory when pageout 568 * is unable to meet memory demands by swapping out processes. 569 * In addition to freeing up memory, swapping also reduces the 570 * demand for memory by preventing user processes from running 571 * and thereby consuming memory. 
572 */ 573 if (clockinit.ci_maxfastscan == 0) { 574 if (pageout_new_spread != 0) { 575 maxfastscan = pageout_new_spread; 576 } else { 577 maxfastscan = MAXHANDSPREADPAGES; 578 } 579 } else { 580 maxfastscan = clockinit.ci_maxfastscan; 581 } 582 583 if (clockinit.ci_fastscan == 0) { 584 fastscan = MIN(looppages / loopfraction, maxfastscan); 585 } else { 586 fastscan = clockinit.ci_fastscan; 587 } 588 589 if (fastscan > looppages / loopfraction) { 590 fastscan = looppages / loopfraction; 591 } 592 593 /* 594 * Set slow scan time to 1/10 the fast scan time, but 595 * not to exceed maxslowscan. 596 */ 597 if (clockinit.ci_slowscan == 0) { 598 slowscan = MIN(fastscan / 10, maxslowscan); 599 } else { 600 slowscan = clockinit.ci_slowscan; 601 } 602 603 if (slowscan > fastscan / 2) { 604 slowscan = fastscan / 2; 605 } 606 607 /* 608 * Handspreadpages is distance (in pages) between front and back 609 * pageout daemon hands. The amount of time to reclaim a page 610 * once pageout examines it increases with this distance and 611 * decreases as the scan rate rises. It must be < the amount 612 * of pageable memory. 613 * 614 * Since pageout is limited to ~4% of the CPU, setting handspreadpages 615 * to be "fastscan" results in the front hand being a few secs 616 * (varies based on the processor speed) ahead of the back hand 617 * at fastscan rates. This distance can be further reduced, if 618 * necessary, by increasing the processor time used by pageout 619 * to be more than ~4% and preferrably not more than ~10%. 620 * 621 * As a result, user processes have a much better chance of 622 * referencing their pages before the back hand examines them. 623 * This also significantly lowers the number of reclaims from 624 * the freelist since pageout does not end up freeing pages which 625 * may be referenced a sec later. 
626 */ 627 if (clockinit.ci_handspreadpages == 0) { 628 handspreadpages = fastscan; 629 } else { 630 handspreadpages = clockinit.ci_handspreadpages; 631 } 632 633 /* 634 * Make sure that back hand follows front hand by at least 635 * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the 636 * back hand to look at a page during the same wakeup of the pageout 637 * daemon in which the front hand cleared its ref bit. 638 */ 639 if (handspreadpages >= looppages) { 640 handspreadpages = looppages - 1; 641 } 642 643 /* 644 * If we have been called to recalculate the parameters, set a flag to 645 * re-evaluate the clock hand pointers. 646 */ 647 if (recalc) { 648 reset_hands = 1; 649 } 650 } 651 652 /* 653 * Pageout scheduling. 654 * 655 * Schedpaging controls the rate at which the page out daemon runs by 656 * setting the global variables nscan and desscan SCHEDPAGING_HZ 657 * times a second. Nscan records the number of pages pageout has examined 658 * in its current pass; schedpaging() resets this value to zero each time 659 * it runs. Desscan records the number of pages pageout should examine 660 * in its next pass; schedpaging() sets this value based on the amount of 661 * currently available memory. 662 */ 663 #define SCHEDPAGING_HZ 4 664 665 static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */ 666 667 /* 668 * Pool of available async pageout putpage requests. 669 */ 670 static struct async_reqs *push_req; 671 static struct async_reqs *req_freelist; /* available req structs */ 672 static struct async_reqs *push_list; /* pending reqs */ 673 static kmutex_t push_lock; /* protects req pool */ 674 static kcondvar_t push_cv; 675 676 /* 677 * If pageout() is stuck on a single push for this many seconds, 678 * pageout_deadman() will assume the system has hit a memory deadlock. If set 679 * to 0, the deadman will have no effect. 
680 * 681 * Note that we are only looking for stalls in the calls that pageout() makes 682 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging 683 * I/O, which should not take long unless the underlying strategy call blocks 684 * indefinitely for memory. The actual I/O request happens (or fails) later. 685 */ 686 uint_t pageout_deadman_seconds = 90; 687 688 static uint_t pageout_stucktime = 0; 689 static bool pageout_pushing = false; 690 static uint64_t pageout_pushcount = 0; 691 static uint64_t pageout_pushcount_seen = 0; 692 693 static int async_list_size = 256; /* number of async request structs */ 694 695 static void pageout_scanner(void); 696 697 /* 698 * If a page is being shared more than "po_share" times 699 * then leave it alone- don't page it out. 700 */ 701 #define MIN_PO_SHARE (8) 702 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24) 703 ulong_t po_share = MIN_PO_SHARE; 704 705 /* 706 * Schedule rate for paging. 707 * Rate is linear interpolation between 708 * slowscan with lotsfree and fastscan when out of memory. 709 */ 710 static void 711 schedpaging(void *arg) 712 { 713 spgcnt_t vavail; 714 715 if (freemem < lotsfree + needfree + kmem_reapahead) 716 kmem_reap(); 717 718 if (freemem < lotsfree + needfree) 719 seg_preap(); 720 721 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree)) 722 kcage_cageout_wakeup(); 723 724 if (mutex_tryenter(&pageout_mutex)) { 725 /* pageout() not running */ 726 nscan = 0; 727 vavail = freemem - deficit; 728 if (pageout_new_spread != 0) 729 vavail -= needfree; 730 if (vavail < 0) 731 vavail = 0; 732 if (vavail > lotsfree) 733 vavail = lotsfree; 734 735 /* 736 * Fix for 1161438 (CRS SPR# 73922). All variables 737 * in the original calculation for desscan were 32 bit signed 738 * ints. As freemem approaches 0x0 on a system with 1 Gig or 739 * more of memory, the calculation can overflow. When this 740 * happens, desscan becomes negative and pageout_scanner() 741 * stops paging out. 
742 */ 743 if (needfree > 0 && pageout_new_spread == 0) { 744 /* 745 * If we've not yet collected enough samples to 746 * calculate a spread, use the old logic of kicking 747 * into high gear anytime needfree is non-zero. 748 */ 749 desscan = fastscan / SCHEDPAGING_HZ; 750 } else { 751 /* 752 * Once we've calculated a spread based on system 753 * memory and usage, just treat needfree as another 754 * form of deficit. 755 */ 756 spgcnt_t faststmp, slowstmp, result; 757 758 slowstmp = slowscan * vavail; 759 faststmp = fastscan * (lotsfree - vavail); 760 result = (slowstmp + faststmp) / 761 nz(lotsfree) / SCHEDPAGING_HZ; 762 desscan = (pgcnt_t)result; 763 } 764 765 pageout_nsec = min_pageout_nsec + (lotsfree - vavail) * 766 (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree); 767 768 if (freemem < lotsfree + needfree || 769 pageout_sample_cnt < pageout_sample_lim) { 770 /* 771 * Either we need more memory, or we still need to 772 * measure the average scan rate. Wake the scanner. 773 */ 774 DTRACE_PROBE(pageout__cv__signal); 775 cv_signal(&proc_pageout->p_cv); 776 } else { 777 /* 778 * There are enough free pages, no need to 779 * kick the scanner thread. And next time 780 * around, keep more of the `highly shared' 781 * pages. 782 */ 783 cv_signal_pageout(); 784 if (po_share > MIN_PO_SHARE) { 785 po_share >>= 1; 786 } 787 } 788 mutex_exit(&pageout_mutex); 789 } 790 791 /* 792 * Signal threads waiting for available memory. 793 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but 794 * in this case it is not needed - the waiters will be waken up during 795 * the next invocation of this function. 796 */ 797 if (kmem_avail() > 0) 798 cv_broadcast(&memavail_cv); 799 800 (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ); 801 } 802 803 pgcnt_t pushes; 804 ulong_t push_list_size; /* # of requests on pageout queue */ 805 806 /* 807 * Paging out should always be enabled. This tunable exists to hold pageout 808 * for debugging purposes. 
If set to 0, pageout_scanner() will go back to 809 * sleep each time it is woken by schedpaging(). 810 */ 811 uint_t dopageout = 1; 812 813 /* 814 * The page out daemon, which runs as process 2. 815 * 816 * As long as there are at least lotsfree pages, 817 * this process is not run. When the number of free 818 * pages stays in the range desfree to lotsfree, 819 * this daemon runs through the pages in the loop 820 * at a rate determined in schedpaging(). Pageout manages 821 * two hands on the clock. The front hand moves through 822 * memory, clearing the reference bit, 823 * and stealing pages from procs that are over maxrss. 824 * The back hand travels a distance behind the front hand, 825 * freeing the pages that have not been referenced in the time 826 * since the front hand passed. If modified, they are pushed to 827 * swap before being freed. 828 * 829 * There are 2 threads that act on behalf of the pageout process. 830 * One thread scans pages (pageout_scanner) and frees them up if 831 * they don't require any VOP_PUTPAGE operation. If a page must be 832 * written back to its backing store, the request is put on a list 833 * and the other (pageout) thread is signaled. The pageout thread 834 * grabs VOP_PUTPAGE requests from the list, and processes them. 835 * Some filesystems may require resources for the VOP_PUTPAGE 836 * operations (like memory) and hence can block the pageout 837 * thread, but the scanner thread can still operate. There is still 838 * no guarantee that memory deadlocks cannot occur. 839 * 840 * For now, this thing is in very rough form. 
841 */ 842 void 843 pageout() 844 { 845 struct async_reqs *arg; 846 pri_t pageout_pri; 847 int i; 848 pgcnt_t max_pushes; 849 callb_cpr_t cprinfo; 850 851 proc_pageout = ttoproc(curthread); 852 proc_pageout->p_cstime = 0; 853 proc_pageout->p_stime = 0; 854 proc_pageout->p_cutime = 0; 855 proc_pageout->p_utime = 0; 856 bcopy("pageout", PTOU(curproc)->u_psargs, 8); 857 bcopy("pageout", PTOU(curproc)->u_comm, 7); 858 859 /* 860 * Create pageout scanner thread 861 */ 862 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL); 863 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL); 864 865 /* 866 * Allocate and initialize the async request structures 867 * for pageout. 868 */ 869 push_req = (struct async_reqs *) 870 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP); 871 872 req_freelist = push_req; 873 for (i = 0; i < async_list_size - 1; i++) { 874 push_req[i].a_next = &push_req[i + 1]; 875 } 876 877 pageout_pri = curthread->t_pri; 878 879 /* Create the pageout scanner thread. */ 880 (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, 881 pageout_pri - 1); 882 883 /* 884 * kick off pageout scheduler. 885 */ 886 schedpaging(NULL); 887 888 /* 889 * Create kernel cage thread. 890 * The kernel cage thread is started under the pageout process 891 * to take advantage of the less restricted page allocation 892 * in page_create_throttle(). 893 */ 894 kcage_cageout_init(); 895 896 /* 897 * Limit pushes to avoid saturating pageout devices. 
898 */ 899 max_pushes = maxpgio / SCHEDPAGING_HZ; 900 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout"); 901 902 for (;;) { 903 mutex_enter(&push_lock); 904 905 while ((arg = push_list) == NULL || pushes > max_pushes) { 906 CALLB_CPR_SAFE_BEGIN(&cprinfo); 907 cv_wait(&push_cv, &push_lock); 908 pushes = 0; 909 CALLB_CPR_SAFE_END(&cprinfo, &push_lock); 910 } 911 push_list = arg->a_next; 912 arg->a_next = NULL; 913 pageout_pushing = true; 914 mutex_exit(&push_lock); 915 916 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, 917 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { 918 pushes++; 919 } 920 921 /* vp held by checkpage() */ 922 VN_RELE(arg->a_vp); 923 924 mutex_enter(&push_lock); 925 pageout_pushing = false; 926 pageout_pushcount++; 927 arg->a_next = req_freelist; /* back on freelist */ 928 req_freelist = arg; 929 push_list_size--; 930 mutex_exit(&push_lock); 931 } 932 } 933 934 /* 935 * Kernel thread that scans pages looking for ones to free 936 */ 937 static void 938 pageout_scanner(void) 939 { 940 struct page *fronthand, *backhand; 941 uint_t laps; 942 callb_cpr_t cprinfo; 943 pgcnt_t nscan_limit; 944 pgcnt_t pcount; 945 bool sampling; 946 947 CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); 948 mutex_enter(&pageout_mutex); 949 950 /* 951 * The restart case does not attempt to point the hands at roughly 952 * the right point on the assumption that after one circuit things 953 * will have settled down, and restarts shouldn't be that often. 954 */ 955 956 /* 957 * Set the two clock hands to be separated by a reasonable amount, 958 * but no more than 360 degrees apart. 959 */ 960 backhand = page_first(); 961 if (handspreadpages >= total_pages) { 962 fronthand = page_nextn(backhand, total_pages - 1); 963 } else { 964 fronthand = page_nextn(backhand, handspreadpages); 965 } 966 967 /* 968 * Establish the minimum and maximum length of time to be spent 969 * scanning pages per wakeup, limiting the scanner duty cycle. 
The 970 * input percentage values (0-100) must be converted to a fraction of 971 * the number of nanoseconds in a second of wall time, then further 972 * scaled down by the number of scanner wakeups in a second: 973 */ 974 min_pageout_nsec = MAX(1, 975 NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); 976 max_pageout_nsec = MAX(min_pageout_nsec, 977 NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); 978 979 loop: 980 cv_signal_pageout(); 981 982 CALLB_CPR_SAFE_BEGIN(&cprinfo); 983 cv_wait(&proc_pageout->p_cv, &pageout_mutex); 984 CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); 985 986 /* 987 * Check if pageout has been disabled for debugging purposes: 988 */ 989 if (!dopageout) { 990 goto loop; 991 } 992 993 /* 994 * One may reset the clock hands for debugging purposes. Hands will 995 * also be reset if memory is added to or removed from the system. 996 */ 997 if (reset_hands) { 998 reset_hands = 0; 999 1000 backhand = page_first(); 1001 if (handspreadpages >= total_pages) { 1002 fronthand = page_nextn(backhand, total_pages - 1); 1003 } else { 1004 fronthand = page_nextn(backhand, handspreadpages); 1005 } 1006 } 1007 1008 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); 1009 1010 /* 1011 * Keep track of the number of times we have scanned all the way around 1012 * the loop: 1013 */ 1014 laps = 0; 1015 1016 DTRACE_PROBE(pageout__start); 1017 1018 /* 1019 * Track the number of pages visited during this scan so that we can 1020 * periodically measure our duty cycle. 1021 */ 1022 pcount = 0; 1023 1024 if (pageout_sample_cnt < pageout_sample_lim) { 1025 /* 1026 * We need to measure the rate at which the system is able to 1027 * scan pages of memory. Each of these initial samples is a 1028 * scan of all system memory, regardless of whether or not we 1029 * are experiencing memory pressure. 
1030 */ 1031 nscan_limit = total_pages; 1032 sampling = true; 1033 } else { 1034 nscan_limit = desscan; 1035 sampling = false; 1036 } 1037 1038 sample_start = gethrtime(); 1039 1040 /* 1041 * Scan the appropriate number of pages for a single duty cycle. 1042 */ 1043 while (nscan < nscan_limit) { 1044 checkpage_result_t rvfront, rvback; 1045 1046 if (!sampling && freemem >= lotsfree + needfree) { 1047 /* 1048 * We are not sampling and enough memory has become 1049 * available that scanning is no longer required. 1050 */ 1051 break; 1052 } 1053 1054 /* 1055 * Periodically check to see if we have exceeded the CPU duty 1056 * cycle for a single wakeup. 1057 */ 1058 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { 1059 pageout_cycle_nsec = gethrtime() - sample_start; 1060 if (pageout_cycle_nsec >= pageout_nsec) { 1061 ++pageout_timeouts; 1062 break; 1063 } 1064 } 1065 1066 /* 1067 * If checkpage manages to add a page to the free list, 1068 * we give ourselves another couple of trips around the loop. 1069 */ 1070 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) { 1071 laps = 0; 1072 } 1073 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) { 1074 laps = 0; 1075 } 1076 1077 ++pcount; 1078 1079 /* 1080 * Protected by pageout_mutex instead of cpu_stat_lock: 1081 */ 1082 CPU_STATS_ADDQ(CPU, vm, scan, 1); 1083 1084 /* 1085 * Don't include ineligible pages in the number scanned. 1086 */ 1087 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) { 1088 nscan++; 1089 } 1090 1091 backhand = page_next(backhand); 1092 fronthand = page_next(fronthand); 1093 1094 /* 1095 * The front hand has wrapped around to the first page in the 1096 * loop. 1097 */ 1098 if (fronthand == page_first()) { 1099 laps++; 1100 DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps); 1101 1102 /* 1103 * Protected by pageout_mutex instead of cpu_stat_lock: 1104 */ 1105 CPU_STATS_ADDQ(CPU, vm, rev, 1); 1106 1107 if (laps > 1) { 1108 /* 1109 * Extremely unlikely, but it happens. 
1110 * We went around the loop at least once 1111 * and didn't get far enough. 1112 * If we are still skipping `highly shared' 1113 * pages, skip fewer of them. Otherwise, 1114 * give up till the next clock tick. 1115 */ 1116 if (po_share < MAX_PO_SHARE) { 1117 po_share <<= 1; 1118 } else { 1119 break; 1120 } 1121 } 1122 } 1123 } 1124 1125 sample_end = gethrtime(); 1126 1127 DTRACE_PROBE1(pageout__end, uint_t, laps); 1128 1129 if (pageout_new_spread == 0) { 1130 if (pageout_sample_cnt < pageout_sample_lim) { 1131 /* 1132 * Continue accumulating samples until we have enough 1133 * to get a reasonable value for average scan rate: 1134 */ 1135 pageout_sample_pages += pcount; 1136 pageout_sample_etime += sample_end - sample_start; 1137 ++pageout_sample_cnt; 1138 } 1139 1140 if (pageout_sample_cnt >= pageout_sample_lim) { 1141 /* 1142 * We have enough samples, set the spread. 1143 */ 1144 pageout_rate = (hrrate_t)pageout_sample_pages * 1145 (hrrate_t)(NANOSEC) / pageout_sample_etime; 1146 pageout_new_spread = pageout_rate / 10; 1147 setupclock(); 1148 } 1149 } 1150 1151 goto loop; 1152 } 1153 1154 /* 1155 * The pageout deadman is run once per second by clock(). 1156 */ 1157 void 1158 pageout_deadman(void) 1159 { 1160 if (panicstr != NULL) { 1161 /* 1162 * There is no pageout after panic. 1163 */ 1164 return; 1165 } 1166 1167 if (pageout_deadman_seconds == 0) { 1168 /* 1169 * The deadman is not enabled. 1170 */ 1171 return; 1172 } 1173 1174 if (!pageout_pushing) { 1175 goto reset; 1176 } 1177 1178 /* 1179 * We are pushing a page. Check to see if it is the same call we saw 1180 * last time we looked: 1181 */ 1182 if (pageout_pushcount != pageout_pushcount_seen) { 1183 /* 1184 * It is a different call from the last check, so we are not 1185 * stuck. 
1186 */ 1187 goto reset; 1188 } 1189 1190 if (++pageout_stucktime >= pageout_deadman_seconds) { 1191 panic("pageout_deadman: stuck pushing the same page for %d " 1192 "seconds (freemem is %lu)", pageout_deadman_seconds, 1193 freemem); 1194 } 1195 1196 return; 1197 1198 reset: 1199 /* 1200 * Reset our tracking state to reflect that we are not stuck: 1201 */ 1202 pageout_stucktime = 0; 1203 pageout_pushcount_seen = pageout_pushcount; 1204 } 1205 1206 /* 1207 * Look at the page at hand. If it is locked (e.g., for physical i/o), 1208 * system (u., page table) or free, then leave it alone. Otherwise, 1209 * if we are running the front hand, turn off the page's reference bit. 1210 * If the proc is over maxrss, we take it. If running the back hand, 1211 * check whether the page has been reclaimed. If not, free the page, 1212 * pushing it to disk first if necessary. 1213 * 1214 * Return values: 1215 * CKP_INELIGIBLE if the page is not a candidate at all, 1216 * CKP_NOT_FREED if the page was not freed, or 1217 * CKP_FREED if we freed it. 1218 */ 1219 static checkpage_result_t 1220 checkpage(struct page *pp, pageout_hand_t whichhand) 1221 { 1222 int ppattr; 1223 int isfs = 0; 1224 int isexec = 0; 1225 int pagesync_flag; 1226 1227 /* 1228 * Skip pages: 1229 * - associated with the kernel vnode since 1230 * they are always "exclusively" locked. 1231 * - that are free 1232 * - that are shared more than po_share'd times 1233 * - its already locked 1234 * 1235 * NOTE: These optimizations assume that reads are atomic. 1236 */ 1237 1238 if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) || 1239 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 1240 hat_page_checkshare(pp, po_share)) { 1241 return (CKP_INELIGIBLE); 1242 } 1243 1244 if (!page_trylock(pp, SE_EXCL)) { 1245 /* 1246 * Skip the page if we can't acquire the "exclusive" lock. 
1247 */ 1248 return (CKP_INELIGIBLE); 1249 } else if (PP_ISFREE(pp)) { 1250 /* 1251 * It became free between the above check and our actually 1252 * locking the page. Oh well, there will be other pages. 1253 */ 1254 page_unlock(pp); 1255 return (CKP_INELIGIBLE); 1256 } 1257 1258 /* 1259 * Reject pages that cannot be freed. The page_struct_lock 1260 * need not be acquired to examine these 1261 * fields since the page has an "exclusive" lock. 1262 */ 1263 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1264 page_unlock(pp); 1265 return (CKP_INELIGIBLE); 1266 } 1267 1268 /* 1269 * Maintain statistics for what we are freeing 1270 */ 1271 if (pp->p_vnode != NULL) { 1272 if (pp->p_vnode->v_flag & VVMEXEC) 1273 isexec = 1; 1274 1275 if (!IS_SWAPFSVP(pp->p_vnode)) 1276 isfs = 1; 1277 } 1278 1279 /* 1280 * Turn off REF and MOD bits with the front hand. 1281 * The back hand examines the REF bit and always considers 1282 * SHARED pages as referenced. 1283 */ 1284 if (whichhand == POH_FRONT) { 1285 pagesync_flag = HAT_SYNC_ZERORM; 1286 } else { 1287 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF | 1288 HAT_SYNC_STOPON_SHARED; 1289 } 1290 1291 ppattr = hat_pagesync(pp, pagesync_flag); 1292 1293 recheck: 1294 /* 1295 * If page is referenced; make unreferenced but reclaimable. 1296 * If this page is not referenced, then it must be reclaimable 1297 * and we can add it to the free list. 1298 */ 1299 if (ppattr & P_REF) { 1300 DTRACE_PROBE2(pageout__isref, page_t *, pp, 1301 pageout_hand_t, whichhand); 1302 1303 if (whichhand == POH_FRONT) { 1304 /* 1305 * Checking of rss or madvise flags needed here... 1306 * 1307 * If not "well-behaved", fall through into the code 1308 * for not referenced. 1309 */ 1310 hat_clrref(pp); 1311 } 1312 1313 /* 1314 * Somebody referenced the page since the front 1315 * hand went by, so it's not a candidate for 1316 * freeing up. 
1317 */ 1318 page_unlock(pp); 1319 return (CKP_NOT_FREED); 1320 } 1321 1322 VM_STAT_ADD(pageoutvmstats.checkpage[0]); 1323 1324 /* 1325 * If large page, attempt to demote it. If successfully demoted, 1326 * retry the checkpage. 1327 */ 1328 if (pp->p_szc != 0) { 1329 if (!page_try_demote_pages(pp)) { 1330 VM_STAT_ADD(pageoutvmstats.checkpage[1]); 1331 page_unlock(pp); 1332 return (CKP_INELIGIBLE); 1333 } 1334 1335 ASSERT(pp->p_szc == 0); 1336 VM_STAT_ADD(pageoutvmstats.checkpage[2]); 1337 1338 /* 1339 * Since page_try_demote_pages() could have unloaded some 1340 * mappings it makes sense to reload ppattr. 1341 */ 1342 ppattr = hat_page_getattr(pp, P_MOD | P_REF); 1343 } 1344 1345 /* 1346 * If the page is currently dirty, we have to arrange to have it 1347 * cleaned before it can be freed. 1348 * 1349 * XXX - ASSERT(pp->p_vnode != NULL); 1350 */ 1351 if ((ppattr & P_MOD) && pp->p_vnode != NULL) { 1352 struct vnode *vp = pp->p_vnode; 1353 u_offset_t offset = pp->p_offset; 1354 1355 /* 1356 * XXX - Test for process being swapped out or about to exit? 1357 * [Can't get back to process(es) using the page.] 1358 */ 1359 1360 /* 1361 * Hold the vnode before releasing the page lock to 1362 * prevent it from being freed and re-used by some 1363 * other thread. 1364 */ 1365 VN_HOLD(vp); 1366 page_unlock(pp); 1367 1368 /* 1369 * Queue I/O request for the pageout thread. 1370 */ 1371 if (!queue_io_request(vp, offset)) { 1372 VN_RELE(vp); 1373 return (CKP_NOT_FREED); 1374 } 1375 return (CKP_FREED); 1376 } 1377 1378 /* 1379 * Now we unload all the translations and put the page back on to the 1380 * free list. If the page was used (referenced or modified) after the 1381 * pagesync but before it was unloaded we catch it and handle the page 1382 * properly. 
1383 */ 1384 DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand); 1385 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1386 ppattr = hat_page_getattr(pp, P_MOD | P_REF); 1387 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) { 1388 goto recheck; 1389 } 1390 1391 VN_DISPOSE(pp, B_FREE, 0, kcred); 1392 1393 CPU_STATS_ADD_K(vm, dfree, 1); 1394 1395 if (isfs) { 1396 if (isexec) { 1397 CPU_STATS_ADD_K(vm, execfree, 1); 1398 } else { 1399 CPU_STATS_ADD_K(vm, fsfree, 1); 1400 } 1401 } else { 1402 CPU_STATS_ADD_K(vm, anonfree, 1); 1403 } 1404 1405 return (CKP_FREED); 1406 } 1407 1408 /* 1409 * Queue async i/o request from pageout_scanner and segment swapout 1410 * routines on one common list. This ensures that pageout devices (swap) 1411 * are not saturated by pageout_scanner or swapout requests. 1412 * The pageout thread empties this list by initiating i/o operations. 1413 */ 1414 int 1415 queue_io_request(vnode_t *vp, u_offset_t off) 1416 { 1417 struct async_reqs *arg; 1418 1419 /* 1420 * If we cannot allocate an async request struct, 1421 * skip this page. 1422 */ 1423 mutex_enter(&push_lock); 1424 if ((arg = req_freelist) == NULL) { 1425 mutex_exit(&push_lock); 1426 return (0); 1427 } 1428 req_freelist = arg->a_next; /* adjust freelist */ 1429 push_list_size++; 1430 1431 arg->a_vp = vp; 1432 arg->a_off = off; 1433 arg->a_len = PAGESIZE; 1434 arg->a_flags = B_ASYNC | B_FREE; 1435 arg->a_cred = kcred; /* always held */ 1436 1437 /* 1438 * Add to list of pending write requests. 1439 */ 1440 arg->a_next = push_list; 1441 push_list = arg; 1442 1443 if (req_freelist == NULL) { 1444 /* 1445 * No free async requests left. The lock is held so we 1446 * might as well signal the pusher thread now. 1447 */ 1448 cv_signal(&push_cv); 1449 } 1450 mutex_exit(&push_lock); 1451 return (1); 1452 } 1453 1454 /* 1455 * Wakeup pageout to initiate i/o if push_list is not empty. 
1456 */ 1457 void 1458 cv_signal_pageout() 1459 { 1460 if (push_list != NULL) { 1461 mutex_enter(&push_lock); 1462 cv_signal(&push_cv); 1463 mutex_exit(&push_lock); 1464 } 1465 } 1466