/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005 Yahoo! Technologies Norway AS
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * The proverbial page-out daemon.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

#include <machine/mutex.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static int vm_pageout_clean(vm_page_t);
static void vm_pageout_pmap_collect(void);
static void vm_pageout_scan(int pass);

struct proc *pageproc;

static struct kproc_desc page_kp = {
	"pagedaemon",
	vm_pageout,
	&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon(void);
static struct proc *vmproc;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
#endif


int vm_pages_needed;		/* Event on which pageout daemon sleeps */
int vm_pageout_deficit;		/* Estimated number of pages deficit */
int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
#endif
static int vm_max_launder = 32;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_algorithm=0;
static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;

#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
#else
static int vm_swap_enabled=1;
static int vm_swap_idle_enabled=0;
#endif

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_req_vmdaemon(void);
#endif
static void vm_pageout_page_stats(void);

/*
 * vm_pageout_fallback_object_lock:
 *
 * Lock vm object currently associated with `m'.  VM_OBJECT_TRYLOCK is
 * known to have failed and the page queue must be either PQ_ACTIVE or
 * PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
 * while locking the vm object.  Use a marker page to detect page queue
 * changes and maintain the notion of the next page on the page queue.
 * Return TRUE if no changes were detected, FALSE otherwise.  The vm
 * object is locked on return.
 *
 * This function depends on both the lock portion of struct vm_object
 * and normal struct vm_page being type stable.
 */
static boolean_t
vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	boolean_t unchanged;
	u_short queue;
	vm_object_t object;

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = m->queue;
	marker.wire_count = 1;

	queue = m->queue;
	object = m->object;

	TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl,
			   m, &marker, pageq);
	vm_page_unlock_queues();
	VM_OBJECT_LOCK(object);
	vm_page_lock_queues();

	/* Page queue might have changed. */
	*next = TAILQ_NEXT(&marker, pageq);
	unchanged = (m->queue == queue &&
		     m->object == object &&
		     &marker == TAILQ_NEXT(m, pageq));
	TAILQ_REMOVE(&vm_page_queues[queue].pl,
		     &marker, pageq);
	return (unchanged);
}

/*
 * vm_pageout_clean:
 *
 * Clean the page and remove it from the laundry.
 *
 * We set the busy bit to cause potential page faults on this page to
 * block.  Note the careful timing, however: the busy bit isn't set till
 * late and we cannot do anything that will mess with the page.
 */
static int
vm_pageout_clean(m)
	vm_page_t m;
{
	vm_object_t object;
	vm_page_t mc[2*vm_pageout_page_count];
	int pageout_count;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);

	/*
	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
	 * with the new swapper, but we could have serious problems paging
	 * out other object types if there is insufficient memory.
	 *
	 * Unfortunately, checking free memory here is far too late, so the
	 * check has been moved up a procedural level.
	 */

	/*
	 * Don't mess with the page if it's busy, held, or special
	 */
	if ((m->hold_count != 0) ||
	    ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
		return 0;
	}

	mc[vm_pageout_page_count] = m;
	pageout_count = 1;
	page_base = vm_pageout_page_count;
	ib = 1;
	is = 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
	object = m->object;
more:
	while (ib && pageout_count < vm_pageout_page_count) {
		vm_page_t p;

		if (ib > pindex) {
			ib = 0;
			break;
		}

		if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
			ib = 0;
			break;
		}
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
			ib = 0;
			break;
		}
		vm_page_test_dirty(p);
		if ((p->dirty & p->valid) == 0 ||
		    p->queue != PQ_INACTIVE ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			ib = 0;
			break;
		}
		mc[--page_base] = p;
		++pageout_count;
		++ib;
		/*
		 * alignment boundary, stop here and switch directions.  Do
		 * not clear ib.
		 */
		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
			break;
	}

	while (pageout_count < vm_pageout_page_count &&
	    pindex + is < object->size) {
		vm_page_t p;

		if ((p = vm_page_lookup(object, pindex + is)) == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
			break;
		}
		vm_page_test_dirty(p);
		if ((p->dirty & p->valid) == 0 ||
		    p->queue != PQ_INACTIVE ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			break;
		}
		mc[page_base + pageout_count] = p;
		++pageout_count;
		++is;
	}

	/*
	 * If we exhausted our forward scan, continue with the reverse scan
	 * when possible, even past a page boundary.  This catches boundary
	 * conditions.
	 */
	if (ib && pageout_count < vm_pageout_page_count)
		goto more;

	/*
	 * we allow reads during pageouts...
	 */
	return (vm_pageout_flush(&mc[page_base], pageout_count, 0));
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we set up for the start of
 * I/O ( i.e. busy the page ), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags)
{
	vm_object_t object = mc[0]->object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter and
	 * mark the pages read-only.
	 *
	 * We do not have to fix up the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
			mc[i], i, count));
		vm_page_io_start(mc[i]);
		pmap_page_protect(mc[i], VM_PROT_READ);
	}
	vm_page_unlock_queues();
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
	    (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
	    pageout_status);

	vm_page_lock_queues();
	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		KASSERT((mt->flags & PG_WRITEABLE) == 0,
		    ("vm_pageout_flush: page %p is not write protected", mt));
		switch (pageout_status[i]) {
		case VM_PAGER_OK:
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * If the page couldn't be paged out, then reactivate
			 * it so that it doesn't clog the inactive list.  (We
			 * will try paging it out again later.)
			 */
			vm_page_activate(mt);
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If the operation is still going, leave the page busy to
		 * block all other accesses. Also, leave the paging in
		 * progress indicator set so that we don't attempt an object
		 * collapse.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_object_pip_wakeup(object);
			vm_page_io_finish(mt);
			if (vm_page_count_severe())
				vm_page_try_to_cache(mt);
		}
	}
	return numpagedout;
}

#if !defined(NO_SWAPPING)
/*
 *	vm_pageout_object_deactivate_pages
 *
 *	deactivate enough pages to satisfy the inactive target
 *	requirements or if vm_page_proc_limit is set, then
 *	deactivate all of the pages in the object and its
 *	backing_objects.
 *
 *	The object and map must be locked.
 */
static void
vm_pageout_object_deactivate_pages(pmap, first_object, desired)
	pmap_t pmap;
	vm_object_t first_object;
	long desired;
{
	vm_object_t backing_object, object;
	vm_page_t p, next;
	int actcount, rcount, remove_mode;

	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
	if (first_object->type == OBJT_DEVICE || first_object->type == OBJT_PHYS)
		return;
	for (object = first_object;; object = backing_object) {
		if (pmap_resident_count(pmap) <= desired)
			goto unlock_return;
		if (object->paging_in_progress)
			goto unlock_return;

		remove_mode = 0;
		if (object->shadow_count > 1)
			remove_mode = 1;
		/*
		 * scan the object's entire memory queue
		 */
		rcount = object->resident_page_count;
		p = TAILQ_FIRST(&object->memq);
		vm_page_lock_queues();
		while (p && (rcount-- > 0)) {
			if (pmap_resident_count(pmap) <= desired) {
				vm_page_unlock_queues();
				goto unlock_return;
			}
			next = TAILQ_NEXT(p, listq);
			cnt.v_pdpages++;
			if (p->wire_count != 0 ||
			    p->hold_count != 0 ||
			    p->busy != 0 ||
			    (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
			    !pmap_page_exists_quick(pmap, p)) {
				p = next;
				continue;
			}
			actcount = pmap_ts_referenced(p);
			if (actcount) {
				vm_page_flag_set(p, PG_REFERENCED);
			} else if (p->flags & PG_REFERENCED) {
				actcount = 1;
			}
			if ((p->queue != PQ_ACTIVE) &&
			    (p->flags & PG_REFERENCED)) {
				vm_page_activate(p);
				p->act_count += actcount;
				vm_page_flag_clear(p, PG_REFERENCED);
			} else if (p->queue == PQ_ACTIVE) {
				if ((p->flags & PG_REFERENCED) == 0) {
					p->act_count -= min(p->act_count, ACT_DECLINE);
					if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
						pmap_remove_all(p);
						vm_page_deactivate(p);
					} else {
						vm_pageq_requeue(p);
					}
				} else {
					vm_page_activate(p);
					vm_page_flag_clear(p, PG_REFERENCED);
					if (p->act_count < (ACT_MAX - ACT_ADVANCE))
						p->act_count += ACT_ADVANCE;
					vm_pageq_requeue(p);
				}
			} else if (p->queue == PQ_INACTIVE) {
				pmap_remove_all(p);
			}
			p = next;
		}
		vm_page_unlock_queues();
		if ((backing_object = object->backing_object) == NULL)
			goto unlock_return;
		VM_OBJECT_LOCK(backing_object);
		if (object != first_object)
			VM_OBJECT_UNLOCK(object);
	}
unlock_return:
	if (object != first_object)
		VM_OBJECT_UNLOCK(object);
}

/*
 * deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_pageout_map_deactivate_pages(map, desired)
	vm_map_t map;
	long desired;
{
	vm_map_entry_t tmpe;
	vm_object_t obj, bigobj;
	int nothingwired;

	if (!vm_map_trylock(map))
		return;

	bigobj = NULL;
	nothingwired = TRUE;

	/*
	 * first, search out the biggest object, and try to free pages from
	 * that.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) {
				if (obj->shadow_count <= 1 &&
				    (bigobj == NULL ||
				     bigobj->resident_page_count < obj->resident_page_count)) {
					if (bigobj != NULL)
						VM_OBJECT_UNLOCK(bigobj);
					bigobj = obj;
				} else
					VM_OBJECT_UNLOCK(obj);
			}
		}
		if (tmpe->wired_count > 0)
			nothingwired = FALSE;
		tmpe = tmpe->next;
	}

	if (bigobj != NULL) {
		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
		VM_OBJECT_UNLOCK(bigobj);
	}
	/*
	 * Next, hunt around for other pages to deactivate.  We actually
	 * do this search sort of wrong -- .text first is not the best idea.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			break;
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL) {
				VM_OBJECT_LOCK(obj);
				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
				VM_OBJECT_UNLOCK(obj);
			}
		}
		tmpe = tmpe->next;
	}

	/*
	 * Remove all mappings if a process is swapped out; this will free
	 * page table pages.
	 */
	if (desired == 0 && nothingwired) {
		pmap_remove(vm_map_pmap(map), vm_map_min(map),
		    vm_map_max(map));
	}
	vm_map_unlock(map);
}
#endif		/* !defined(NO_SWAPPING) */

/*
 * This routine is very drastic, but can save the system
 * in a pinch.
 */
static void
vm_pageout_pmap_collect(void)
{
	int i;
	vm_page_t m;
	static int warningdone;

	if (pmap_pagedaemon_waken == 0)
		return;
	if (warningdone < 5) {
		printf("collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
		warningdone++;
	}
	vm_page_lock_queues();
	for (i = 0; i < vm_page_array_size; i++) {
		m = &vm_page_array[i];
		if (m->wire_count || m->hold_count || m->busy ||
		    (m->flags & (PG_BUSY | PG_UNMANAGED)))
			continue;
		pmap_remove_all(m);
	}
	vm_page_unlock_queues();
	pmap_pagedaemon_waken = 0;
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
static void
vm_pageout_scan(int pass)
{
	vm_page_t m, next;
	struct vm_page marker;
	int page_shortage, maxscan, pcount;
	int addl_page_shortage, addl_page_shortage_init;
	struct proc *p, *bigproc;
	struct thread *td;
	vm_offset_t size, bigsize;
	vm_object_t object;
	int actcount, cache_cur, cache_first_failure;
	static int cache_last_free;
	int vnodes_skipped = 0;
	int maxlaunder;

	mtx_lock(&Giant);
	/*
	 * Decrease registered cache sizes.
	 */
	EVENTHANDLER_INVOKE(vm_lowmem, 0);
	/*
	 * We do this explicitly after the caches have been drained above.
	 */
	uma_reclaim();
	/*
	 * Do whatever cleanup the pmap code can.
	 */
	vm_pageout_pmap_collect();

	addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit);

	/*
	 * Calculate the number of pages we want to either free or move
	 * to the cache.
	 */
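	/*
	 * addl_page_shortage_init accounts for the pages that recent
	 * allocations failed to obtain (vm_pageout_deficit, cleared above),
	 * on top of the normal paging target.
	 */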
	page_shortage = vm_paging_target() + addl_page_shortage_init;

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = PQ_INACTIVE;
	marker.wire_count = 1;

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * maxlaunder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
	 */
	if ((maxlaunder = vm_max_launder) <= 1)
		maxlaunder = 1;
	if (pass)
		maxlaunder = 10000;
	vm_page_lock_queues();
rescan0:
	addl_page_shortage = addl_page_shortage_init;
	maxscan = cnt.v_inactive_count;

	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
	     m != NULL && maxscan-- > 0 && page_shortage > 0;
	     m = next) {

		cnt.v_pdpages++;

		if (m->queue != PQ_INACTIVE) {
			goto rescan0;
		}

		next = TAILQ_NEXT(m, pageq);
		object = m->object;

		/*
		 * skip marker pages
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * A held page may be undergoing I/O, so skip it.
		 */
		if (m->hold_count) {
			vm_pageq_requeue(m);
			addl_page_shortage++;
			continue;
		}
		/*
		 * Don't mess with busy pages; keep them at the front of the
		 * queue since they are most likely being paged out.
		 */
		if (!VM_OBJECT_TRYLOCK(object) &&
		    (!vm_pageout_fallback_object_lock(m, &next) ||
		     m->hold_count != 0)) {
			VM_OBJECT_UNLOCK(object);
			addl_page_shortage++;
			continue;
		}
		if (m->busy || (m->flags & PG_BUSY)) {
			VM_OBJECT_UNLOCK(object);
			addl_page_shortage++;
			continue;
		}

		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		if (object->ref_count == 0) {
			vm_page_flag_clear(m, PG_REFERENCED);
			pmap_clear_reference(m);

		/*
		 * Otherwise, if the page has been referenced while in the
		 * inactive queue, we bump the "activation count" upwards,
		 * making it less likely that the page will be added back to
		 * the inactive queue prematurely again.  Here we check the
		 * page tables (or emulated bits, if any), because the upper
		 * level VM system does not know anything about existing
		 * references.
		 */
		} else if (((m->flags & PG_REFERENCED) == 0) &&
		    (actcount = pmap_ts_referenced(m))) {
			vm_page_activate(m);
			VM_OBJECT_UNLOCK(object);
			m->act_count += (actcount + ACT_ADVANCE);
			continue;
		}

		/*
		 * If the upper level VM system knows about any page
		 * references, we activate the page.  We also set the
		 * "activation count" higher than normal so that we will be
		 * less likely to place pages back onto the inactive queue
		 * again.
		 */
		if ((m->flags & PG_REFERENCED) != 0) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount = pmap_ts_referenced(m);
			vm_page_activate(m);
			VM_OBJECT_UNLOCK(object);
			m->act_count += (actcount + ACT_ADVANCE + 1);
			continue;
		}

		/*
		 * If the upper level VM system doesn't know anything about
		 * the page being dirty, we have to check for it again.  As
		 * far as the VM code knows, any partially dirty pages are
		 * fully dirty.
		 */
		if (m->dirty == 0 && !pmap_is_modified(m)) {
			/*
			 * Avoid a race condition: Unless write access is
			 * removed from the page, another processor could
			 * modify it before all access is removed by the call
			 * to vm_page_cache() below.  If vm_page_cache() finds
			 * that the page has been modified when it removes all
			 * access, it panics because it cannot cache dirty
			 * pages.  In principle, we could eliminate just write
			 * access here rather than all access.  In the expected
			 * case, when there are no last instant modifications
			 * to the page, removing all access will be cheaper
			 * overall.
			 */
			if ((m->flags & PG_WRITEABLE) != 0)
				pmap_remove_all(m);
		} else {
			vm_page_dirty(m);
		}

		if (m->valid == 0) {
			/*
			 * Invalid pages can be easily freed
			 */
			pmap_remove_all(m);
			vm_page_free(m);
			cnt.v_dfree++;
			--page_shortage;
		} else if (m->dirty == 0) {
			/*
			 * Clean pages can be placed onto the cache queue.
			 * This effectively frees them.
			 */
			vm_page_cache(m);
			--page_shortage;
		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
			/*
			 * Dirty pages need to be paged out, but flushing
			 * a page is extremely expensive versus freeing
			 * a clean page.  Rather than artificially limiting
			 * the number of pages we can flush, we instead give
			 * dirty pages extra priority on the inactive queue
			 * by forcing them to be cycled through the queue
			 * twice before being flushed, after which the
			 * (now clean) page will cycle through once more
			 * before being freed.  This significantly extends
			 * the thrash point for a heavily loaded machine.
			 */
			vm_page_flag_set(m, PG_WINATCFLS);
			vm_pageq_requeue(m);
		} else if (maxlaunder > 0) {
			/*
			 * We always want to try to flush some dirty pages if
			 * we encounter them, to keep the system stable.
			 * Normally this number is small, but under extreme
			 * pressure where there are insufficient clean pages
			 * on the inactive queue, we may have to go all out.
			 */
			int swap_pageouts_ok;
			struct vnode *vp = NULL;
			struct mount *mp;

			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
				swap_pageouts_ok = 1;
			} else {
				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
				vm_page_count_min());

			}

			/*
			 * We don't bother paging objects that are "dead".
			 * Those objects are in a "rundown" state.
			 */
			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
				VM_OBJECT_UNLOCK(object);
				vm_pageq_requeue(m);
				continue;
			}

			/*
			 * The object is already known NOT to be dead.  It
			 * is possible for the vget() to block the whole
			 * pageout daemon, but the new low-memory handling
			 * code should prevent it.
			 *
			 * The previous code skipped locked vnodes and, worse,
			 * reordered pages in the queue.  This results in
			 * completely non-deterministic operation and, on a
			 * busy system, can lead to extremely non-optimal
			 * pageouts.  For example, it can cause clean pages
			 * to be freed and dirty pages to be moved to the end
			 * of the queue.  Since dirty pages are also moved to
			 * the end of the queue once-cleaned, this gives
			 * way too large a weighting to deferring the freeing
			 * of dirty pages.
			 *
			 * We can't wait forever for the vnode lock, we might
			 * deadlock due to a vn_read() getting stuck in
			 * vm_wait while holding this vnode.  We skip the
			 * vnode if we can't get it in a reasonable amount
			 * of time.
			 */
			if (object->type == OBJT_VNODE) {
				vp = object->handle;
				mp = NULL;
				if (vp->v_type == VREG)
					vn_start_write(vp, &mp, V_NOWAIT);
				vm_page_unlock_queues();
				VI_LOCK(vp);
				VM_OBJECT_UNLOCK(object);
				if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK |
				    LK_TIMELOCK, curthread)) {
					VM_OBJECT_LOCK(object);
					vm_page_lock_queues();
					++pageout_lock_miss;
					vn_finished_write(mp);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					VM_OBJECT_UNLOCK(object);
					continue;
				}
				VM_OBJECT_LOCK(object);
				vm_page_lock_queues();
				/*
				 * The page might have been moved to another
				 * queue during potential blocking in vget()
				 * above.  The page might have been freed and
				 * reused for another vnode.  The object might
				 * have been reused for another vnode.
				 */
				if (m->queue != PQ_INACTIVE ||
				    m->object != object ||
				    object->handle != vp) {
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}

				/*
				 * The page may have been busied during the
				 * blocking in vget().  We don't move the
				 * page back onto the end of the queue so that
				 * statistics are more correct if we don't.
				 */
				if (m->busy || (m->flags & PG_BUSY)) {
					goto unlock_and_continue;
				}

				/*
				 * If the page has become held it might
				 * be undergoing I/O, so skip it
				 */
				if (m->hold_count) {
					vm_pageq_requeue(m);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}
			}

			/*
			 * If a page is dirty, then it is either being washed
			 * (but not yet cleaned) or it is still in the
			 * laundry.  If it is still in the laundry, then we
			 * start the cleaning operation.
			 *
			 * This operation may cluster, invalidating the 'next'
			 * pointer.  To prevent an inordinate number of
			 * restarts we use our marker to remember our place.
			 *
			 * decrement page_shortage on success to account for
			 * the (future) cleaned page.  Otherwise we could wind
			 * up laundering or cleaning too many pages.
			 */
			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
			if (vm_pageout_clean(m) != 0) {
				--page_shortage;
				--maxlaunder;
			}
			next = TAILQ_NEXT(&marker, pageq);
			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
unlock_and_continue:
			VM_OBJECT_UNLOCK(object);
			if (vp) {
				vm_page_unlock_queues();
				vput(vp);
				vn_finished_write(mp);
				vm_page_lock_queues();
			}
			continue;
		}
		VM_OBJECT_UNLOCK(object);
	}

	/*
	 * Compute the number of pages we want to try to move from the
	 * active queue to the inactive queue.
	 */
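	/*
	 * In addition to any shortfall left over from the inactive scan
	 * above, aim to refill the inactive queue up to its target size.
	 */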
	page_shortage = vm_paging_target() +
		cnt.v_inactive_target - cnt.v_inactive_count;
	page_shortage += addl_page_shortage;

	/*
	 * Scan the active queue for things we can deactivate. We nominally
	 * track the per-page activity counter and use it to locate
	 * deactivation candidates.
	 */
	pcount = cnt.v_active_count;
	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);

	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {

		KASSERT(m->queue == PQ_ACTIVE,
		    ("vm_pageout_scan: page %p isn't active", m));

		next = TAILQ_NEXT(m, pageq);
		object = m->object;
		if ((m->flags & PG_MARKER) != 0) {
			m = next;
			continue;
		}
		if (!VM_OBJECT_TRYLOCK(object) &&
		    !vm_pageout_fallback_object_lock(m, &next)) {
			VM_OBJECT_UNLOCK(object);
			m = next;
			continue;
		}

		/*
		 * Don't deactivate pages that are busy.
		 */
		if ((m->busy != 0) ||
		    (m->flags & PG_BUSY) ||
		    (m->hold_count != 0)) {
			VM_OBJECT_UNLOCK(object);
			vm_pageq_requeue(m);
			m = next;
			continue;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used.
		 */
		actcount = 0;
		if (object->ref_count != 0) {
			if (m->flags & PG_REFERENCED) {
				actcount += 1;
			}
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}

		/*
		 * Since we have "tested" this bit, we need to clear it now.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * We use the page activation count stats only if the object
		 * is currently being used.
		 */
		if (actcount && (object->ref_count != 0)) {
			vm_pageq_requeue(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			if (vm_pageout_algorithm ||
			    object->ref_count == 0 ||
			    m->act_count == 0) {
				page_shortage--;
				if (object->ref_count == 0) {
					pmap_remove_all(m);
					if (m->dirty == 0)
						vm_page_cache(m);
					else
						vm_page_deactivate(m);
				} else {
					vm_page_deactivate(m);
				}
			} else {
				vm_pageq_requeue(m);
			}
		}
		VM_OBJECT_UNLOCK(object);
		m = next;
	}

	/*
	 * We try to maintain some *really* free pages; this allows interrupt
	 * code to be guaranteed space.  Since both cache and free queues
	 * are considered basically 'free', moving pages from cache to free
	 * does not affect other calculations.
	 */
	cache_cur = cache_last_free;
	cache_first_failure = -1;
	while (cnt.v_free_count < cnt.v_free_reserved && (cache_cur =
	    (cache_cur + PQ_PRIME2) & PQ_L2_MASK) != cache_first_failure) {
		TAILQ_FOREACH(m, &vm_page_queues[PQ_CACHE + cache_cur].pl,
		    pageq) {
			KASSERT(m->dirty == 0,
			    ("Found dirty cache page %p", m));
			KASSERT(!pmap_page_is_mapped(m),
			    ("Found mapped cache page %p", m));
			KASSERT((m->flags & PG_UNMANAGED) == 0,
			    ("Found unmanaged cache page %p", m));
			KASSERT(m->wire_count == 0,
			    ("Found wired cache page %p", m));
			if (m->hold_count == 0 && VM_OBJECT_TRYLOCK(object =
			    m->object)) {
				KASSERT((m->flags & PG_BUSY) == 0 &&
				    m->busy == 0, ("Found busy cache page %p",
				    m));
				vm_page_free(m);
				VM_OBJECT_UNLOCK(object);
				cnt.v_dfree++;
				cache_last_free = cache_cur;
				cache_first_failure = -1;
				break;
			}
		}
		if (m == NULL && cache_first_failure == -1)
			cache_first_failure = cache_cur;
	}
	vm_page_unlock_queues();
#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static long lsec;
		if (time_second != lsec) {
			vm_pageout_req_swapout |= VM_SWAP_IDLE;
			vm_req_vmdaemon();
			lsec = time_second;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages and have skipped a vnode
	 * in a writeable object, wake up the sync daemon.  Also kick off
	 * swapout if we didn't get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min())
			(void) speedup_syncer();
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			vm_req_vmdaemon();
			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
		}
#endif
	}

	/*
	 * If we are critically low on one of RAM or swap and low on
	 * the other, kill the largest process.  However, we avoid
	 * doing this on the first pass in order to give ourselves a
	 * chance to flush out dirty vnode-backed pages and to allow
	 * active pages to be moved to the inactive queue and reclaimed.
	 *
	 * We keep the process bigproc locked once we find it to keep anyone
	 * from messing with it; however, there is a possibility of
	 * deadlock if process B is bigproc and one of its child processes
	 * attempts to propagate a signal to B while we are waiting for A's
	 * lock while walking this list.  To avoid this, we don't block on
	 * the process lock but just skip a process if it is already locked.
	 */
	if (pass != 0 &&
	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
	     (swap_pager_full && vm_paging_target() > 0))) {
		bigproc = NULL;
		bigsize = 0;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			int breakout;

			if (PROC_TRYLOCK(p) == 0)
				continue;
			/*
			 * If this is a system or protected process, skip it.
			 */
			if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
			    (p->p_flag & P_PROTECTED) ||
			    ((p->p_pid < 48) && (swap_pager_avail != 0))) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * If the process is in a non-running type state,
			 * don't touch it.  Check all the threads individually.
			 */
			mtx_lock_spin(&sched_lock);
			breakout = 0;
			FOREACH_THREAD_IN_PROC(p, td) {
				if (!TD_ON_RUNQ(td) &&
				    !TD_IS_RUNNING(td) &&
				    !TD_IS_SLEEPING(td)) {
					breakout = 1;
					break;
				}
			}
			if (breakout) {
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(p);
				continue;
			}
			mtx_unlock_spin(&sched_lock);
			/*
			 * get the process size
			 */
			if (!vm_map_trylock_read(&p->p_vmspace->vm_map)) {
				PROC_UNLOCK(p);
				continue;
			}
			size = vmspace_swap_count(p->p_vmspace);
			vm_map_unlock_read(&p->p_vmspace->vm_map);
			size += vmspace_resident_count(p->p_vmspace);
			/*
			 * If this process is bigger than the biggest one,
			 * remember it.
			 */
			if (size > bigsize) {
				if (bigproc != NULL)
					PROC_UNLOCK(bigproc);
				bigproc = p;
				bigsize = size;
			} else
				PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (bigproc != NULL) {
			killproc(bigproc, "out of swap space");
			mtx_lock_spin(&sched_lock);
			sched_nice(bigproc, PRIO_MIN);
			mtx_unlock_spin(&sched_lock);
			PROC_UNLOCK(bigproc);
			wakeup(&cnt.v_free_count);
		}
	}
	mtx_unlock(&Giant);
}

/*
 * This routine tries to maintain the pseudo LRU active queue, so that
 * during long periods with no paging, some statistics accumulation still
 * occurs.  This code helps the situation where paging just starts to occur.
 */
static void
vm_pageout_page_stats()
{
	vm_object_t object;
	vm_page_t m,next;
	int pcount,tpcount;		/* Number of pages to check */
	static int fullintervalcount = 0;
	int page_shortage;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	page_shortage =
	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);

	if (page_shortage <= 0)
		return;

	pcount = cnt.v_active_count;
	fullintervalcount += vm_pageout_stats_interval;
	if (fullintervalcount < vm_pageout_full_stats_interval) {
		tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count;
		if (pcount > tpcount)
			pcount = tpcount;
	} else {
		fullintervalcount = 0;
	}

	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
	while ((m != NULL) && (pcount-- > 0)) {
		int actcount;

		KASSERT(m->queue == PQ_ACTIVE,
		    ("vm_pageout_page_stats: page %p isn't active", m));

		next = TAILQ_NEXT(m, pageq);
		object = m->object;

		if ((m->flags & PG_MARKER) != 0) {
			m = next;
			continue;
		}
		if (!VM_OBJECT_TRYLOCK(object) &&
		    !vm_pageout_fallback_object_lock(m, &next)) {
			VM_OBJECT_UNLOCK(object);
			m = next;
			continue;
		}

		/*
		 * Don't deactivate pages that are busy.
		 */
		if ((m->busy != 0) ||
		    (m->flags & PG_BUSY) ||
		    (m->hold_count != 0)) {
			VM_OBJECT_UNLOCK(object);
			vm_pageq_requeue(m);
			m = next;
			continue;
		}

		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}

		actcount += pmap_ts_referenced(m);
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			vm_pageq_requeue(m);
		} else {
			if (m->act_count == 0) {
				/*
				 * We turn off page access, so that we have
				 * more accurate RSS stats.  We don't do this
				 * in the normal page deactivation when the
				 * system is loaded VM wise, because the
				 * cost of the large number of page protect
				 * operations would be higher than the value
				 * of doing the operation.
				 */
				pmap_remove_all(m);
				vm_page_deactivate(m);
			} else {
				m->act_count -= min(m->act_count, ACT_DECLINE);
				vm_pageq_requeue(m);
			}
		}
		VM_OBJECT_UNLOCK(object);
		m = next;
	}
}

/*
 * vm_pageout is the high level pageout daemon.
 */
static void
vm_pageout()
{
	int error, pass;

	/*
	 * Initialize some paging parameters.
	 */
	cnt.v_interrupt_free_min = 2;
	if (cnt.v_page_count < 2000)
		vm_pageout_page_count = 8;

	/*
	 * v_free_reserved needs to include enough for the largest
	 * swap pager structures plus enough for any pv_entry structs
	 * when paging.
	 */
	if (cnt.v_page_count > 1024)
		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
	else
		cnt.v_free_min = 4;
	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
	    cnt.v_interrupt_free_min;
	cnt.v_free_reserved = vm_pageout_page_count +
	    cnt.v_pageout_free_min + (cnt.v_page_count / 768) + PQ_L2_SIZE;
	cnt.v_free_severe = cnt.v_free_min / 2;
	cnt.v_free_min += cnt.v_free_reserved;
	cnt.v_free_severe += cnt.v_free_reserved;

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.  Note
	 * that these are more a measure of the VM cache queue hysteresis
	 * than the VM free queue.  Specifically, v_free_target is the
	 * high water mark (free+cache pages).
	 *
	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
	 * low water mark, while v_free_min is the stop.  v_cache_min must
	 * be big enough to handle memory needs while the pageout daemon
	 * is signalled and run to free more pages.
	 */
	if (cnt.v_free_count > 6144)
		cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
	else
		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;

	if (cnt.v_free_count > 2048) {
		cnt.v_cache_min = cnt.v_free_target;
		cnt.v_cache_max = 2 * cnt.v_cache_min;
		cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
	} else {
		cnt.v_cache_min = 0;
		cnt.v_cache_max = 0;
		cnt.v_inactive_target = cnt.v_free_count / 4;
	}
	if (cnt.v_inactive_target > cnt.v_free_count / 3)
		cnt.v_inactive_target = cnt.v_free_count / 3;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = cnt.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = cnt.v_free_target;

	/*
	 * Set interval in seconds for stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

	swap_pager_swap_init();
	pass = 0;
	/*
	 * The pageout daemon is never done, so loop forever.
	 */
	while (TRUE) {
		vm_page_lock_queues();
		/*
		 * If we have enough free memory, wake up waiters.  Do
		 * not clear vm_pages_needed until we reach our target,
		 * otherwise we may be woken up over and over again and
		 * waste a lot of cpu.
		 */
		if (vm_pages_needed && !vm_page_count_min()) {
			if (!vm_paging_needed())
				vm_pages_needed = 0;
			wakeup(&cnt.v_free_count);
		}
		if (vm_pages_needed) {
			/*
			 * Still not done, take a second pass without waiting
			 * (unlimited dirty cleaning); otherwise sleep a bit
			 * and try again.
			 */
			++pass;
			if (pass > 1)
				msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM,
				       "psleep", hz/2);
		} else {
			/*
			 * Good enough, sleep & handle stats.  Prime the pass
			 * for the next run.
			 */
			if (pass > 1)
				pass = 1;
			else
				pass = 0;
			error = msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM,
				    "psleep", vm_pageout_stats_interval * hz);
			if (error && !vm_pages_needed) {
				pass = 0;
				vm_pageout_page_stats();
				vm_page_unlock_queues();
				continue;
			}
		}
		if (vm_pages_needed)
			cnt.v_pdwakeups++;
		vm_page_unlock_queues();
		vm_pageout_scan(pass);
	}
}

/*
 * Unless the page queue lock is held by the caller, this function
 * should be regarded as advisory.  Specifically, the caller should
 * not msleep() on &cnt.v_free_count following this function unless
 * the page queue lock is held until the msleep() is performed.
 */
void
pagedaemon_wakeup()
{

	if (!vm_pages_needed && curthread->td_proc != pageproc) {
		vm_pages_needed = 1;
		wakeup(&vm_pages_needed);
	}
}

#if !defined(NO_SWAPPING)
static void
vm_req_vmdaemon()
{
	static int lastrun = 0;

	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
}

static void
vm_daemon()
{
	struct rlimit rsslim;
	struct proc *p;
	struct thread *td;
	int breakout;

	mtx_lock(&Giant);
	while (TRUE) {
		tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
		if (vm_pageout_req_swapout) {
			swapout_procs(vm_pageout_req_swapout);
			vm_pageout_req_swapout = 0;
		}
		/*
		 * scan the processes for exceeding their rlimits or if
		 * process is swapped out -- deactivate pages
		 */
		sx_slock(&allproc_lock);
		LIST_FOREACH(p, &allproc, p_list) {
			vm_pindex_t limit, size;

			/*
			 * if this is a system process or if we have already
			 * looked at this process, skip it.
			 */
			PROC_LOCK(p);
			if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * if the process is in a non-running type state,
			 * don't touch it.
			 */
			mtx_lock_spin(&sched_lock);
			breakout = 0;
			FOREACH_THREAD_IN_PROC(p, td) {
				if (!TD_ON_RUNQ(td) &&
				    !TD_IS_RUNNING(td) &&
				    !TD_IS_SLEEPING(td)) {
					breakout = 1;
					break;
				}
			}
			mtx_unlock_spin(&sched_lock);
			if (breakout) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * get a limit
			 */
			lim_rlimit(p, RLIMIT_RSS, &rsslim);
			limit = OFF_TO_IDX(
			    qmin(rsslim.rlim_cur, rsslim.rlim_max));

			/*
			 * let processes that are swapped out really be
			 * swapped out; set the limit to nothing (will force a
			 * swap-out.)
			 */
			if ((p->p_sflag & PS_INMEM) == 0)
				limit = 0;	/* XXX */
			PROC_UNLOCK(p);

			size = vmspace_resident_count(p->p_vmspace);
			if (limit >= 0 && size >= limit) {
				vm_pageout_map_deactivate_pages(
				    &p->p_vmspace->vm_map, limit);
			}
		}
		sx_sunlock(&allproc_lock);
	}
}
#endif			/* !defined(NO_SWAPPING) */