/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2021 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
#include <sys/stdbool.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

/*
 * FREE MEMORY MANAGEMENT
 *
 * Management of the pool of free pages is a tricky business.  There are
 * several critical threshold values which constrain our allocation of new
 * pages and inform the rate of paging out of memory to swap.  These threshold
 * values, and the behaviour they induce, are described below in descending
 * order of size -- and thus increasing order of severity!
 *
 *   +---------------------------------------------------- physmem (all memory)
 *   |
 *   | Ordinarily there are no particular constraints placed on page
 *   v allocation.  The page scanner is not running and page_create_va()
 *   | will effectively grant all page requests (whether from the kernel
 *   | or from user processes) without artificial delay.
 *   |
 *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
 *   |
 *   | When we have less than "lotsfree" pages, pageout_scanner() is
 *   v signalled by schedpaging() to begin looking for pages that can
 *   | be evicted to disk to bring us back above lotsfree.  At this
 *   | stage there is still no constraint on allocation of free pages.
 *   |
 *   | For small systems, we set a lower bound of 16MB for lotsfree;
 *   v this is the natural value for a system with 1GB memory.  This is
 *   | to ensure that the pageout reserve pool contains at least 4MB
 *   | for use by ZFS.
 *   |
 *   | For systems with a large amount of memory, we constrain lotsfree
 *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 *   v at some point the required slack relates more closely to the
 *   | rate at which paging can occur than to the total amount of memory.
 *   |
 *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 *   |
 *   | When we drop below desfree, a number of kernel facilities will
 *   v wait before allocating more memory, under the assumption that
 *   | pageout or reaping will make progress and free up some memory.
 *   | This behaviour is not especially coordinated; look for comparisons
 *   | of desfree and freemem.
 *   |
 *   | In addition to various attempts at advisory caution, clock()
 *   | will wake up the thread that is ordinarily parked in sched().
 *   | This routine is responsible for the heavy-handed swapping out
 *   v of entire processes in an attempt to arrest the slide of free
 *   | memory.  See comments in sched.c for more details.
 *   |
 *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 *   |
 *   | These two separate tunables have, by default, the same value.
 *   v Various parts of the kernel use minfree to signal the need for
 *   | more aggressive reclamation of memory, and sched() is more
 *   | aggressive at swapping processes out.
 *   |
 *   | If free memory falls below throttlefree, page_create_va() will
 *   | use page_create_throttle() to begin holding most requests for
 *   | new pages while pageout and reaping free up memory.  Sleeping
 *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 *   | more memory.  Non-sleeping allocations are generally allowed to
 *   | proceed, unless their priority is explicitly lowered with
 *   | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).).
 *   |
 *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 *   |
 *   | When we hit throttlefree, the situation is already dire.  The
 *   v system is generally paging out memory and swapping out entire
 *   | processes in order to free up memory for continued operation.
 *   |
 *   | Unfortunately, evicting memory to disk generally requires short
 *   | term use of additional memory; e.g., allocation of buffers for
 *   | storage drivers, updating maps of free and used blocks, etc.
 *   | As such, pageout_reserve is the number of pages that we keep in
 *   | special reserve for use by pageout() and sched() and by any
 *   v other parts of the kernel that need to be working for those to
 *   | make forward progress, such as the ZFS I/O pipeline.
 *   |
 *   | When we are below pageout_reserve, we fail or hold any allocation
 *   | that has not explicitly requested access to the reserve pool.
 *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 *   | can implicitly tap the reserve.  For more details, see the
 *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 *   |
 *   +---------------------------------------------------------- no free memory
 *   |
 *   | If we have arrived here, things are very bad indeed.  It is
 *   v surprisingly difficult to tell if this condition is even fatal,
 *   | as enough memory may have been granted to pageout() and to the
 *   | ZFS I/O pipeline that requests for eviction that have already been
 *   | made will complete and free up memory some time soon.
 *   |
 *   | If free memory does not materialise, the system generally remains
 *   | deadlocked.  The pageout_deadman() below is run once per second
 *   | from clock(), seeking to limit the amount of time a single request
 *   v to page out can be blocked before the system panics to get a crash
 *   | dump and return to service.
 *   |
 *   +-------------------------------------------------------------------------
 */
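
/*
 * As a concrete illustration of the default sizing above (a hypothetical
 * machine with 8GB of physical memory; illustrative only, the real values
 * are computed by setupclock() below):
 *
 *	lotsfree	= 8192MB / 64		= 128MB
 *	desfree		= lotsfree / 2		= 64MB
 *	minfree		= 3/4 of desfree	= 48MB
 *	throttlefree	= minfree		= 48MB
 *	pageout_reserve	= 3/4 of throttlefree	= 36MB
 *
 * The 128MB lotsfree value falls between the 16MB minimum and the 2GB
 * maximum, so no clamping occurs in this example.
 */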

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock().  If they are patched non-zero
 * in a loaded vmunix they are left alone and may thus be changed per system
 * using "mdb -kw" on the loaded system.
 */
pgcnt_t		slowscan = 0;
pgcnt_t		fastscan = 0;

static pgcnt_t	handspreadpages = 0;

/*
 * looppages:
 *     Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *     Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t	loopfraction = 2;
static pgcnt_t	looppages;

static uint_t	min_percent_cpu = 4;
static uint_t	max_percent_cpu = 80;
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;

#define	MEGABYTES		(1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *     set to 1 to use the previous default threshold size calculation;
 *     i.e., each threshold is half of the next largest value.
 */
uint_t pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 * minimum and maximum are specified in bytes, rather than pages; a zero value
 * means the default values (below) are used.
 */
uint_t		lotsfree_fraction = 64;
pgcnt_t		lotsfree_min = 0;
pgcnt_t		lotsfree_max = 0;

#define	LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
#define	LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)

/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is not larger than the threshold above, the specified value will
 * be used directly without any additional calculation or adjustment.  The boot
 * time value of these overrides is preserved in the "clockinit" struct.  More
 * detail is available in the comment at the top of the file.
 */
pgcnt_t		maxpgio = 0;
pgcnt_t		minfree = 0;
pgcnt_t		desfree = 0;
pgcnt_t		lotsfree = 0;
pgcnt_t		needfree = 0;
pgcnt_t		throttlefree = 0;
pgcnt_t		pageout_reserve = 0;

pgcnt_t		deficit;
pgcnt_t		nscan;
pgcnt_t		desscan;

/*
 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 * number of nanoseconds in each wakeup cycle that gives the equivalent of some
 * underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *     nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *     nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *     Number of nanoseconds budgeted for each wakeup cycle.
 *     Computed each time around by schedpaging().
 *     Varies between min_pageout_nsec and max_pageout_nsec,
 *     depending on memory pressure.
 */
static hrtime_t	min_pageout_nsec;
static hrtime_t	max_pageout_nsec;
static hrtime_t	pageout_nsec;

static uint_t	reset_hands;

#define	PAGES_POLL_MASK	1023
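
/*
 * The scanner checks whether it has exceeded its CPU budget only when
 * (pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK, i.e., once every 1024
 * pages visited, rather than paying for a gethrtime() call on every page.
 */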

/*
 * pageout_sample_lim:
 *     The limit on the number of samples needed to establish a value for new
 *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *     handspreadpages.
 *
 * pageout_sample_cnt:
 *     Current sample number.  Once the sample gets large enough, set new
 *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *     The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *     The accumulated nanoseconds for the sample.
 *
 * pageout_rate:
 *     Rate in pages per second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *     Initially zero while the system scan rate is measured by
 *     pageout_scanner(), which then sets this value once per system boot after
 *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 *     new value is used for fastscan and handspreadpages.
 *
 * sample_start, sample_end:
 *     The hrtime at which the last pageout_scanner() sample began and ended.
 */
typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

static hrtime_t	pageout_cycle_nsec;
static hrtime_t	sample_start, sample_end;
static hrtime_t	pageout_sample_etime = 0;

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif	/* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t	memavail_lock;
kcondvar_t	memavail_cv;
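
/*
 * A rough sketch of the waiting pattern used by consumers of memavail_cv
 * (not a verbatim copy of any particular caller):
 *
 *	mutex_enter(&memavail_lock);
 *	while (freemem < lotsfree + needfree)
 *		cv_wait(&memavail_cv, &memavail_lock);
 *	mutex_exit(&memavail_lock);
 *
 * schedpaging() broadcasts on memavail_cv once kmem_avail() reports that
 * memory has become available again.
 */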

typedef enum pageout_hand {
	POH_FRONT = 1,
	POH_BACK,
} pageout_hand_t;

typedef enum {
	CKP_INELIGIBLE,
	CKP_NOT_FREED,
	CKP_FREED,
} checkpage_result_t;

static checkpage_result_t checkpage(page_t *, pageout_hand_t);

static struct clockinit {
	bool ci_init;
	pgcnt_t ci_lotsfree_min;
	pgcnt_t ci_lotsfree_max;
	pgcnt_t ci_lotsfree;
	pgcnt_t ci_desfree;
	pgcnt_t ci_minfree;
	pgcnt_t ci_throttlefree;
	pgcnt_t ci_pageout_reserve;
	pgcnt_t ci_maxpgio;
	pgcnt_t ci_maxfastscan;
	pgcnt_t ci_fastscan;
	pgcnt_t ci_slowscan;
	pgcnt_t ci_handspreadpages;
} clockinit = { .ci_init = false };

static pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
	if (value < minimum) {
		return (minimum);
	} else if (value > maximum) {
		return (maximum);
	} else {
		return (value);
	}
}

static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
	if (initval == 0 || initval >= initval_ceiling) {
		return (defval);
	} else {
		return (initval);
	}
}
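
/*
 * For illustration, the two helpers above compose as follows (hypothetical
 * values):
 *
 *	tune(0, x, d)	returns d (no operator override was supplied)
 *	tune(v, x, d)	returns v when 0 < v < x (the override is honoured)
 *	clamp(p, lo, hi)	bounds the automatically-sized default
 *
 * So in the lotsfree computation below, clamp() first bounds the automatic
 * looppages / lotsfree_fraction sizing between the 16MB floor and the 2GB
 * ceiling; tune() then uses that default unless a valid /etc/system
 * override was preserved in clockinit.
 */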

/*
 * Set up the paging constants for the clock algorithm used by
 * pageout_scanner(), and by the virtual memory system overall.  See the
 * comments at the top of this file for more information about the threshold
 * values and system responses to memory pressure.
 *
 * This routine is called once by main() at startup, after the initial size of
 * physical memory is determined.  It may be called again later if memory is
 * added to or removed from the system, or if new measurements of the page scan
 * rate become available.
 */
void
setupclock(void)
{
	pgcnt_t defval;
	bool half = (pageout_threshold_style == 1);
	bool recalc = true;

	looppages = total_pages;

	/*
	 * The operator may have provided specific values for some of the
	 * tunables via /etc/system.  On our first call, we preserve those
	 * values so that they can be used for subsequent recalculations.
	 *
	 * A value of zero for any tunable means we will use the default
	 * sizing.
	 */
	if (!clockinit.ci_init) {
		clockinit.ci_init = true;

		clockinit.ci_lotsfree_min = lotsfree_min;
		clockinit.ci_lotsfree_max = lotsfree_max;
		clockinit.ci_lotsfree = lotsfree;
		clockinit.ci_desfree = desfree;
		clockinit.ci_minfree = minfree;
		clockinit.ci_throttlefree = throttlefree;
		clockinit.ci_pageout_reserve = pageout_reserve;
		clockinit.ci_maxpgio = maxpgio;
		clockinit.ci_maxfastscan = maxfastscan;
		clockinit.ci_fastscan = fastscan;
		clockinit.ci_slowscan = slowscan;
		clockinit.ci_handspreadpages = handspreadpages;

		/*
		 * The first call does not trigger a recalculation, only
		 * subsequent calls.
		 */
		recalc = false;
	}

	/*
	 * Configure paging threshold values.  For more details on what each
	 * threshold signifies, see the comments at the top of this file.
	 */
	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
	    btop(LOTSFREE_MAX_DEFAULT));
	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
	    btop(LOTSFREE_MIN_DEFAULT));

	lotsfree = tune(clockinit.ci_lotsfree, looppages,
	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

	desfree = tune(clockinit.ci_desfree, lotsfree,
	    lotsfree / 2);

	minfree = tune(clockinit.ci_minfree, desfree,
	    half ? desfree / 2 : 3 * desfree / 4);

	throttlefree = tune(clockinit.ci_throttlefree, desfree,
	    minfree);

	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
	    half ? throttlefree / 2 : 3 * throttlefree / 4);

	/*
	 * maxpgio bounds the rate at which paging I/O is considered
	 * acceptable.  This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (clockinit.ci_maxpgio == 0) {
		maxpgio = (DISKRPM * 2) / 3;
	} else {
		maxpgio = clockinit.ci_maxpgio;
	}
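
	/*
	 * For illustration: DISKRPM has historically been defined as 60
	 * revolutions per second, which yields a default maxpgio of 40
	 * page-push operations per second -- and thus, with SCHEDPAGING_HZ
	 * at 4, a budget of 10 pushes per schedpaging() wakeup in pageout()
	 * below.
	 */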

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  The fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per second using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, MHz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per second for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 MHz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 MHz SC2000:		68 Meg
	 *
	 *	40 MHz 486:		26 Meg
	 *	66 MHz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 MHz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger page sizes, etc.) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPUs can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (clockinit.ci_maxfastscan == 0) {
		if (pageout_new_spread != 0) {
			maxfastscan = pageout_new_spread;
		} else {
			maxfastscan = MAXHANDSPREADPAGES;
		}
	} else {
		maxfastscan = clockinit.ci_maxfastscan;
	}

	if (clockinit.ci_fastscan == 0) {
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	} else {
		fastscan = clockinit.ci_fastscan;
	}

	if (fastscan > looppages / loopfraction) {
		fastscan = looppages / loopfraction;
	}

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (clockinit.ci_slowscan == 0) {
		slowscan = MIN(fastscan / 10, maxslowscan);
	} else {
		slowscan = clockinit.ci_slowscan;
	}

	if (slowscan > fastscan / 2) {
		slowscan = fastscan / 2;
	}
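
	/*
	 * An illustrative walk through the sizing above (a hypothetical
	 * machine with 8GB of memory and 4KB pages, before any scan rate
	 * sampling has completed):
	 *
	 *	looppages	= ~2M pages
	 *	maxfastscan	= MAXHANDSPREADPAGES (64MB = 16384 pages)
	 *	fastscan	= MIN(2M / 2, 16384)	= 16384 pages/sec
	 *	slowscan	= MIN(16384 / 10, 100)	= 100 pages/sec
	 *
	 * Once sampling completes, pageout_new_spread typically raises
	 * maxfastscan well above MAXHANDSPREADPAGES on modern hardware.
	 */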

	/*
	 * Handspreadpages is the distance (in pages) between front and back
	 * pageout daemon hands.  The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises.  It must be less than the
	 * amount of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (clockinit.ci_handspreadpages == 0) {
		handspreadpages = fastscan;
	} else {
		handspreadpages = clockinit.ci_handspreadpages;
	}

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
	 * back hand to look at a page during the same wakeup of the pageout
	 * daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages) {
		handspreadpages = looppages - 1;
	}

	/*
	 * If we have been called to recalculate the parameters, set a flag to
	 * re-evaluate the clock hand pointers.
	 */
	if (recalc) {
		reset_hands = 1;
	}
}

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan SCHEDPAGING_HZ
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging() resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging() sets this value based on the amount of
 * currently available memory.
 */
#define	SCHEDPAGING_HZ	4

static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

/*
 * If pageout() is stuck on a single push for this many seconds,
 * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 * to 0, the deadman will have no effect.
 *
 * Note that we are only looking for stalls in the calls that pageout() makes
 * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 * I/O, which should not take long unless the underlying strategy call blocks
 * indefinitely for memory.  The actual I/O request happens (or fails) later.
 */
uint_t pageout_deadman_seconds = 90;

static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;

static int async_list_size = 256;	/* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone; don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;
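
/*
 * To sketch the dynamics: po_share starts at MIN_PO_SHARE (8).  Each time
 * the scanner completes more than one full lap without freeing enough
 * pages, it doubles po_share (so that fewer shared pages are skipped), up
 * to MAX_PO_SHARE (8 << 24, i.e., 134217728).  When memory is plentiful
 * again, schedpaging() halves po_share back toward MIN_PO_SHARE.
 */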

/*
 * Schedule rate for paging.
 * The scan rate varies by linear interpolation: it is slowscan when free
 * memory is at lotsfree, and rises to fastscan as free memory approaches
 * zero.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		/* pageout() not running */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		/*
		 * Fix for 1161438 (CRS SPR# 73922).  All variables
		 * in the original calculation for desscan were 32 bit signed
		 * ints.  As freemem approaches 0x0 on a system with 1GB or
		 * more of memory, the calculation can overflow.  When this
		 * happens, desscan becomes negative and pageout_scanner()
		 * stops paging out.
		 */
		if (needfree > 0 && pageout_new_spread == 0) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / SCHEDPAGING_HZ;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / SCHEDPAGING_HZ;
			desscan = (pgcnt_t)result;
		}

		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
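
		/*
		 * To illustrate the interpolation with hypothetical
		 * numbers: with slowscan = 100, fastscan = 16384, and
		 * vavail at half of lotsfree, desscan works out to
		 * (100/2 + 16384/2) / 4, or about 2060 pages per wakeup,
		 * and pageout_nsec sits halfway between its minimum and
		 * maximum duty cycle.
		 */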

		if (freemem < lotsfree + needfree ||
		    pageout_sample_cnt < pageout_sample_lim) {
			/*
			 * Either we need more memory, or we still need to
			 * measure the average scan rate.  Wake the scanner.
			 */
			DTRACE_PROBE(pageout__cv__signal);
			cv_signal(&proc_pageout->p_cv);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner thread.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE) {
				po_share >>= 1;
			}
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast,
	 * but in this case it is not needed - the waiters will be woken up
	 * during the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}

pgcnt_t	pushes;
ulong_t	push_list_size;		/* # of requests on pageout queue */

/*
 * Paging out should always be enabled.  This tunable exists to hold pageout
 * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 * sleep each time it is woken by schedpaging().
 */
uint_t dopageout = 1;

/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run.  When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in schedpaging().  Pageout manages
 * two hands on the clock.  The front hand moves through
 * memory, clearing the reference bit,
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed.  If modified, they are pushed to
 * swap before being freed.
 *
 * There are two threads that act on behalf of the pageout process.
 * One thread scans pages (pageout_scanner) and frees them up if
 * they don't require any VOP_PUTPAGE operation.  If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled.  The pageout thread
 * grabs VOP_PUTPAGE requests from the list, and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the scanner thread can still operate.  There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * For now, this thing is in very rough form.
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Create pageout scanner thread
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri;

	/* Create the pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
	    pageout_pri - 1);

	/*
	 * kick off pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		pageout_pushing = true;
		mutex_exit(&push_lock);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

/*
 * Kernel thread that scans pages looking for ones to free.
 */
static void
pageout_scanner(void)
{
	struct page *fronthand, *backhand;
	uint_t laps;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_limit;
	pgcnt_t	pcount;
	bool sampling;

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down, and restarts shouldn't be that often.
	 */

	/*
	 * Set the two clock hands to be separated by a reasonable amount,
	 * but no more than 360 degrees apart.
	 */
	backhand = page_first();
	if (handspreadpages >= total_pages) {
		fronthand = page_nextn(backhand, total_pages - 1);
	} else {
		fronthand = page_nextn(backhand, handspreadpages);
	}

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle.  The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second:
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
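
	/*
	 * With the defaults (min_percent_cpu = 4, max_percent_cpu = 80,
	 * SCHEDPAGING_HZ = 4), each wakeup cycle spans 250ms of wall time,
	 * so min_pageout_nsec works out to 10ms and max_pageout_nsec to
	 * 200ms of scanning per cycle.
	 */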

loop:
	cv_signal_pageout();

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

	/*
	 * Check if pageout has been disabled for debugging purposes:
	 */
	if (!dopageout) {
		goto loop;
	}

	/*
	 * One may reset the clock hands for debugging purposes.  Hands will
	 * also be reset if memory is added to or removed from the system.
	 */
	if (reset_hands) {
		reset_hands = 0;

		backhand = page_first();
		if (handspreadpages >= total_pages) {
			fronthand = page_nextn(backhand, total_pages - 1);
		} else {
			fronthand = page_nextn(backhand, handspreadpages);
		}
	}

	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

	/*
	 * Keep track of the number of times we have scanned all the way
	 * around the loop:
	 */
	laps = 0;

	DTRACE_PROBE(pageout__start);

	/*
	 * Track the number of pages visited during this scan so that we can
	 * periodically measure our duty cycle.
	 */
	pcount = 0;

	if (pageout_sample_cnt < pageout_sample_lim) {
		/*
		 * We need to measure the rate at which the system is able to
		 * scan pages of memory.  Each of these initial samples is a
		 * scan of all system memory, regardless of whether or not we
		 * are experiencing memory pressure.
		 */
		nscan_limit = total_pages;
		sampling = true;
	} else {
		nscan_limit = desscan;
		sampling = false;
	}

	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 */
	while (nscan < nscan_limit) {
		checkpage_result_t rvfront, rvback;

		if (!sampling && freemem >= lotsfree + needfree) {
			/*
			 * We are not sampling and enough memory has become
			 * available that scanning is no longer required.
			 */
			break;
		}

		/*
		 * Periodically check to see if we have exceeded the CPU duty
		 * cycle for a single wakeup.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			pageout_cycle_nsec = gethrtime() - sample_start;
			if (pageout_cycle_nsec >= pageout_nsec) {
				++pageout_timeouts;
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
			laps = 0;
		}
		if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
			laps = 0;
		}

		++pcount;

		/*
		 * Protected by pageout_mutex instead of cpu_stat_lock:
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
			nscan++;
		}

		backhand = page_next(backhand);
		fronthand = page_next(fronthand);

		/*
		 * The front hand has wrapped around to the first page in the
		 * loop.
		 */
		if (fronthand == page_first()) {
			laps++;
			DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps);

			/*
			 * Protected by pageout_mutex instead of
			 * cpu_stat_lock:
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);

			if (laps > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					break;
				}
			}
		}
	}

	sample_end = gethrtime();

	DTRACE_PROBE1(pageout__end, uint_t, laps);

	if (pageout_new_spread == 0) {
		if (pageout_sample_cnt < pageout_sample_lim) {
			/*
			 * Continue accumulating samples until we have enough
			 * to get a reasonable value for average scan rate:
			 */
			pageout_sample_pages += pcount;
			pageout_sample_etime += sample_end - sample_start;
			++pageout_sample_cnt;
		}

		if (pageout_sample_cnt >= pageout_sample_lim) {
			/*
			 * We have enough samples, set the spread.
			 */
			pageout_rate = (hrrate_t)pageout_sample_pages *
			    (hrrate_t)(NANOSEC) / pageout_sample_etime;
			pageout_new_spread = pageout_rate / 10;
			setupclock();
		}
	}

	goto loop;
}
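
/*
 * A hypothetical sampling outcome, for illustration: if the four boot-time
 * samples together visit 8M pages in a total of 2 seconds of scanning,
 * pageout_rate is 4M pages per second, and pageout_new_spread (and hence
 * fastscan and handspreadpages on the next setupclock()) becomes 400K
 * pages -- roughly the amount scannable in one second at a ~10% duty cycle.
 */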

/*
 * The pageout deadman is run once per second by clock().
 */
void
pageout_deadman(void)
{
	if (panicstr != NULL) {
		/*
		 * There is no pageout after panic.
		 */
		return;
	}

	if (pageout_deadman_seconds == 0) {
		/*
		 * The deadman is not enabled.
		 */
		return;
	}

	if (!pageout_pushing) {
		goto reset;
	}

	/*
	 * We are pushing a page.  Check to see if it is the same call we saw
	 * last time we looked:
	 */
	if (pageout_pushcount != pageout_pushcount_seen) {
		/*
		 * It is a different call from the last check, so we are not
		 * stuck.
		 */
		goto reset;
	}

	if (++pageout_stucktime >= pageout_deadman_seconds) {
		panic("pageout_deadman: stuck pushing the same page for %d "
		    "seconds (freemem is %lu)", pageout_deadman_seconds,
		    freemem);
	}

	return;

reset:
	/*
	 * Reset our tracking state to reflect that we are not stuck:
	 */
	pageout_stucktime = 0;
	pageout_pushcount_seen = pageout_pushcount;
}
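
/*
 * A sketch of how an operator might adjust the deadman (assuming the usual
 * illumos tunable mechanisms): the timeout can be changed, or disabled
 * entirely, either at boot via /etc/system:
 *
 *	set pageout_deadman_seconds = 0
 *
 * or on a live system with mdb -kw:
 *
 *	> pageout_deadman_seconds/W 0
 */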

/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * system (u., page table) or free, then leave it alone.  Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it.  If running the back hand,
 * check whether the page has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *	CKP_INELIGIBLE if the page is not a candidate at all,
 *	CKP_NOT_FREED  if the page was not freed, or
 *	CKP_FREED      if we freed it.
 */
static checkpage_result_t
checkpage(struct page *pp, pageout_hand_t whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode, since
	 *	  they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share times
	 *	- that are already locked
	 *
	 * NOTE: These optimizations assume that reads are atomic.
	 */
	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (CKP_INELIGIBLE);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (CKP_INELIGIBLE);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Reject pages that cannot be freed.  The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */
	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == POH_FRONT) {
		pagesync_flag = HAT_SYNC_ZERORM;
	} else {
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;
	}

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If the page is referenced, make it unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		DTRACE_PROBE2(pageout__isref, page_t *, pp,
		    pageout_hand_t, whichhand);

		if (whichhand == POH_FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}

		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (CKP_NOT_FREED);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it.  If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (CKP_INELIGIBLE);
		}

		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);

		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange to have it
	 * cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue I/O request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (CKP_NOT_FREED);
		}
		return (CKP_FREED);
	}

	/*
	 * Now we unload all the translations and put the page back on to the
	 * free list.  If the page was used (referenced or modified) after the
	 * pagesync but before it was unloaded we catch it and handle the page
	 * properly.
	 */
	DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
		goto recheck;
	}

	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (CKP_FREED);
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.  The pageout
 * thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;		/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left.  The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wake up pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}