/*-
 * Copyright (c) 1991 Regents of the University of California.
 *	All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 *	All rights reserved.
 * Copyright (c) 1994 David Greenman
 *	All rights reserved.
 * Copyright (c) 2005 Yahoo! Technologies Norway AS
 *	All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * The proverbial page-out daemon.
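 *
 * This file implements that daemon: it scans the inactive queue, frees
 * clean pages, clusters and launders dirty pages through their pagers,
 * ages pages on the active queue, and as a last resort invokes
 * vm_pageout_oom().  When swapping is configured, the companion
 * vm_daemon enforces per-process RSS limits and requests whole-process
 * swapouts.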
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static int vm_pageout_clean(vm_page_t);
static void vm_pageout_scan(int pass);

struct proc *pageproc;

static struct kproc_desc page_kp = {
	"pagedaemon",
	vm_pageout,
	&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
    &page_kp);

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon(void);
static struct proc *vmproc;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif


int vm_pages_needed;		/* Event on which pageout daemon sleeps */
int vm_pageout_deficit;		/* Estimated number of pages deficit */
int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
#endif
static int vm_max_launder = 32;
static int vm_pageout_stats_max;
static int vm_pageout_stats;
static int vm_pageout_stats_interval;
static int vm_pageout_full_stats;
static int vm_pageout_full_stats_interval;
static int defer_swap_pageouts;
static int disable_swap_pageouts;

#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
static int vm_swap_idle_enabled = 0;
#else
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
#endif

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats,
	CTLFLAG_RD, &vm_pageout_stats, 0, "Number of partial stats scans");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats,
	CTLFLAG_RD, &vm_pageout_full_stats, 0, "Number of full stats scans");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
SYSCTL_INT(_vm, OID_AUTO, max_wired,
	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");

static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static boolean_t vm_pageout_launder(int, int, vm_paddr_t, vm_paddr_t);
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_req_vmdaemon(int req);
#endif
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
static void vm_pageout_page_stats(void);

/*
 * Initialize a dummy page for marking the caller's place in the specified
 * paging queue.  In principle, this function only needs to set the flag
 * PG_MARKER.  Nonetheless, it sets the flag VPO_BUSY and initializes the hold
 * count to one as safety precautions.
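 * The queue scans below test PG_MARKER and skip marker pages; keeping the
 * marker busy with a non-zero hold count is an additional safeguard so
 * that a marker is never treated as a reclaimable page.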
233 */ 234 static void 235 vm_pageout_init_marker(vm_page_t marker, u_short queue) 236 { 237 238 bzero(marker, sizeof(*marker)); 239 marker->flags = PG_MARKER; 240 marker->oflags = VPO_BUSY; 241 marker->queue = queue; 242 marker->hold_count = 1; 243 } 244 245 /* 246 * vm_pageout_fallback_object_lock: 247 * 248 * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is 249 * known to have failed and page queue must be either PQ_ACTIVE or 250 * PQ_INACTIVE. To avoid lock order violation, unlock the page queues 251 * while locking the vm object. Use marker page to detect page queue 252 * changes and maintain notion of next page on page queue. Return 253 * TRUE if no changes were detected, FALSE otherwise. vm object is 254 * locked on return. 255 * 256 * This function depends on both the lock portion of struct vm_object 257 * and normal struct vm_page being type stable. 258 */ 259 static boolean_t 260 vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) 261 { 262 struct vm_page marker; 263 struct vm_pagequeue *pq; 264 boolean_t unchanged; 265 u_short queue; 266 vm_object_t object; 267 268 queue = m->queue; 269 vm_pageout_init_marker(&marker, queue); 270 pq = &vm_pagequeues[queue]; 271 object = m->object; 272 273 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq); 274 vm_pagequeue_unlock(pq); 275 vm_page_unlock(m); 276 VM_OBJECT_WLOCK(object); 277 vm_page_lock(m); 278 vm_pagequeue_lock(pq); 279 280 /* Page queue might have changed. */ 281 *next = TAILQ_NEXT(&marker, pageq); 282 unchanged = (m->queue == queue && 283 m->object == object && 284 &marker == TAILQ_NEXT(m, pageq)); 285 TAILQ_REMOVE(&pq->pq_pl, &marker, pageq); 286 return (unchanged); 287 } 288 289 /* 290 * Lock the page while holding the page queue lock. Use marker page 291 * to detect page queue changes and maintain notion of next page on 292 * page queue. Return TRUE if no changes were detected, FALSE 293 * otherwise. The page is locked on return. The page queue lock might 294 * be dropped and reacquired. 295 * 296 * This function depends on normal struct vm_page being type stable. 297 */ 298 static boolean_t 299 vm_pageout_page_lock(vm_page_t m, vm_page_t *next) 300 { 301 struct vm_page marker; 302 struct vm_pagequeue *pq; 303 boolean_t unchanged; 304 u_short queue; 305 306 vm_page_lock_assert(m, MA_NOTOWNED); 307 if (vm_page_trylock(m)) 308 return (TRUE); 309 310 queue = m->queue; 311 vm_pageout_init_marker(&marker, queue); 312 pq = &vm_pagequeues[queue]; 313 314 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq); 315 vm_pagequeue_unlock(pq); 316 vm_page_lock(m); 317 vm_pagequeue_lock(pq); 318 319 /* Page queue might have changed. */ 320 *next = TAILQ_NEXT(&marker, pageq); 321 unchanged = (m->queue == queue && &marker == TAILQ_NEXT(m, pageq)); 322 TAILQ_REMOVE(&pq->pq_pl, &marker, pageq); 323 return (unchanged); 324 } 325 326 /* 327 * vm_pageout_clean: 328 * 329 * Clean the page and remove it from the laundry. 330 * 331 * We set the busy bit to cause potential page faults on this page to 332 * block. Note the careful timing, however, the busy bit isn't set till 333 * late and we cannot do anything that will mess with the page. 
334 */ 335 static int 336 vm_pageout_clean(vm_page_t m) 337 { 338 vm_object_t object; 339 vm_page_t mc[2*vm_pageout_page_count], pb, ps; 340 int pageout_count; 341 int ib, is, page_base; 342 vm_pindex_t pindex = m->pindex; 343 344 vm_page_lock_assert(m, MA_OWNED); 345 object = m->object; 346 VM_OBJECT_ASSERT_WLOCKED(object); 347 348 /* 349 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP 350 * with the new swapper, but we could have serious problems paging 351 * out other object types if there is insufficient memory. 352 * 353 * Unfortunately, checking free memory here is far too late, so the 354 * check has been moved up a procedural level. 355 */ 356 357 /* 358 * Can't clean the page if it's busy or held. 359 */ 360 KASSERT(m->busy == 0 && (m->oflags & VPO_BUSY) == 0, 361 ("vm_pageout_clean: page %p is busy", m)); 362 KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m)); 363 vm_page_unlock(m); 364 365 mc[vm_pageout_page_count] = pb = ps = m; 366 pageout_count = 1; 367 page_base = vm_pageout_page_count; 368 ib = 1; 369 is = 1; 370 371 /* 372 * Scan object for clusterable pages. 373 * 374 * We can cluster ONLY if: ->> the page is NOT 375 * clean, wired, busy, held, or mapped into a 376 * buffer, and one of the following: 377 * 1) The page is inactive, or a seldom used 378 * active page. 379 * -or- 380 * 2) we force the issue. 381 * 382 * During heavy mmap/modification loads the pageout 383 * daemon can really fragment the underlying file 384 * due to flushing pages out of order and not trying 385 * align the clusters (which leave sporatic out-of-order 386 * holes). To solve this problem we do the reverse scan 387 * first and attempt to align our cluster, then do a 388 * forward scan if room remains. 389 */ 390 more: 391 while (ib && pageout_count < vm_pageout_page_count) { 392 vm_page_t p; 393 394 if (ib > pindex) { 395 ib = 0; 396 break; 397 } 398 399 if ((p = vm_page_prev(pb)) == NULL || 400 (p->oflags & VPO_BUSY) != 0 || p->busy != 0) { 401 ib = 0; 402 break; 403 } 404 vm_page_lock(p); 405 vm_page_test_dirty(p); 406 if (p->dirty == 0 || 407 p->queue != PQ_INACTIVE || 408 p->hold_count != 0) { /* may be undergoing I/O */ 409 vm_page_unlock(p); 410 ib = 0; 411 break; 412 } 413 vm_page_unlock(p); 414 mc[--page_base] = pb = p; 415 ++pageout_count; 416 ++ib; 417 /* 418 * alignment boundry, stop here and switch directions. Do 419 * not clear ib. 420 */ 421 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) 422 break; 423 } 424 425 while (pageout_count < vm_pageout_page_count && 426 pindex + is < object->size) { 427 vm_page_t p; 428 429 if ((p = vm_page_next(ps)) == NULL || 430 (p->oflags & VPO_BUSY) != 0 || p->busy != 0) 431 break; 432 vm_page_lock(p); 433 vm_page_test_dirty(p); 434 if (p->dirty == 0 || 435 p->queue != PQ_INACTIVE || 436 p->hold_count != 0) { /* may be undergoing I/O */ 437 vm_page_unlock(p); 438 break; 439 } 440 vm_page_unlock(p); 441 mc[page_base + pageout_count] = ps = p; 442 ++pageout_count; 443 ++is; 444 } 445 446 /* 447 * If we exhausted our forward scan, continue with the reverse scan 448 * when possible, even past a page boundry. This catches boundry 449 * conditions. 450 */ 451 if (ib && pageout_count < vm_pageout_page_count) 452 goto more; 453 454 /* 455 * we allow reads during pageouts... 456 */ 457 return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL, 458 NULL)); 459 } 460 461 /* 462 * vm_pageout_flush() - launder the given pages 463 * 464 * The given pages are laundered. 
 *	Note that we setup for the start of I/O (i.e. busy the page),
 *	mark it read-only, and bump the object reference count all in here
 *	rather than in the parent.  If we want the parent to do more
 *	sophisticated things we may have to change the ordering.
 *
 *	Returned runlen is the count of pages between mreq and first
 *	page after mreq with status VM_PAGER_AGAIN.
 *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
 *	for any page in runlen set.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
    boolean_t *eio)
{
	vm_object_t object = mc[0]->object;
	int pageout_status[count];
	int numpagedout = 0;
	int i, runlen;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter and
	 * mark the pages read-only.
	 *
	 * We do not have to fixup the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
			mc[i], i, count));
		vm_page_io_start(mc[i]);
		pmap_remove_write(mc[i]);
	}
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count, flags, pageout_status);

	runlen = count - mreq;
	if (eio != NULL)
		*eio = FALSE;
	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
		    !pmap_page_is_write_mapped(mt),
		    ("vm_pageout_flush: page %p is not write protected", mt));
		switch (pageout_status[i]) {
		case VM_PAGER_OK:
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_undirty(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * If page couldn't be paged out, then reactivate the
			 * page so it doesn't clog the inactive list.  (We
			 * will try paging it out again later.)
			 */
			vm_page_lock(mt);
			vm_page_activate(mt);
			vm_page_unlock(mt);
			if (eio != NULL && i >= mreq && i - mreq < runlen)
				*eio = TRUE;
			break;
		case VM_PAGER_AGAIN:
			if (i >= mreq && i - mreq < runlen)
				runlen = i - mreq;
			break;
		}

		/*
		 * If the operation is still going, leave the page busy to
		 * block all other accesses.  Also, leave the paging in
		 * progress indicator set so that we don't attempt an object
		 * collapse.
553 */ 554 if (pageout_status[i] != VM_PAGER_PEND) { 555 vm_object_pip_wakeup(object); 556 vm_page_io_finish(mt); 557 if (vm_page_count_severe()) { 558 vm_page_lock(mt); 559 vm_page_try_to_cache(mt); 560 vm_page_unlock(mt); 561 } 562 } 563 } 564 if (prunlen != NULL) 565 *prunlen = runlen; 566 return (numpagedout); 567 } 568 569 static boolean_t 570 vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high) 571 { 572 struct mount *mp; 573 struct vm_pagequeue *pq; 574 struct vnode *vp; 575 vm_object_t object; 576 vm_paddr_t pa; 577 vm_page_t m, m_tmp, next; 578 579 pq = &vm_pagequeues[queue]; 580 vm_pagequeue_lock(pq); 581 TAILQ_FOREACH_SAFE(m, &pq->pq_pl, pageq, next) { 582 KASSERT(m->queue == queue, 583 ("vm_pageout_launder: page %p's queue is not %d", m, 584 queue)); 585 if ((m->flags & PG_MARKER) != 0) 586 continue; 587 pa = VM_PAGE_TO_PHYS(m); 588 if (pa < low || pa + PAGE_SIZE > high) 589 continue; 590 if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { 591 vm_page_unlock(m); 592 continue; 593 } 594 object = m->object; 595 if ((!VM_OBJECT_TRYWLOCK(object) && 596 (!vm_pageout_fallback_object_lock(m, &next) || 597 m->hold_count != 0)) || (m->oflags & VPO_BUSY) != 0 || 598 m->busy != 0) { 599 vm_page_unlock(m); 600 VM_OBJECT_WUNLOCK(object); 601 continue; 602 } 603 vm_page_test_dirty(m); 604 if (m->dirty == 0 && object->ref_count != 0) 605 pmap_remove_all(m); 606 if (m->dirty != 0) { 607 vm_page_unlock(m); 608 if (tries == 0 || (object->flags & OBJ_DEAD) != 0) { 609 VM_OBJECT_WUNLOCK(object); 610 continue; 611 } 612 if (object->type == OBJT_VNODE) { 613 vm_pagequeue_unlock(pq); 614 vp = object->handle; 615 vm_object_reference_locked(object); 616 VM_OBJECT_WUNLOCK(object); 617 (void)vn_start_write(vp, &mp, V_WAIT); 618 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 619 VM_OBJECT_WLOCK(object); 620 vm_object_page_clean(object, 0, 0, OBJPC_SYNC); 621 VM_OBJECT_WUNLOCK(object); 622 VOP_UNLOCK(vp, 0); 623 vm_object_deallocate(object); 624 vn_finished_write(mp); 625 return (TRUE); 626 } else if (object->type == OBJT_SWAP || 627 object->type == OBJT_DEFAULT) { 628 vm_pagequeue_unlock(pq); 629 m_tmp = m; 630 vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC, 631 0, NULL, NULL); 632 VM_OBJECT_WUNLOCK(object); 633 return (TRUE); 634 } 635 } else { 636 /* 637 * Dequeue here to prevent lock recursion in 638 * vm_page_cache(). 639 */ 640 vm_page_dequeue_locked(m); 641 vm_page_cache(m); 642 vm_page_unlock(m); 643 } 644 VM_OBJECT_WUNLOCK(object); 645 } 646 vm_pagequeue_unlock(pq); 647 return (FALSE); 648 } 649 650 /* 651 * Increase the number of cached pages. The specified value, "tries", 652 * determines which categories of pages are cached: 653 * 654 * 0: All clean, inactive pages within the specified physical address range 655 * are cached. Will not sleep. 656 * 1: The vm_lowmem handlers are called. All inactive pages within 657 * the specified physical address range are cached. May sleep. 658 * 2: The vm_lowmem handlers are called. All inactive and active pages 659 * within the specified physical address range are cached. May sleep. 660 */ 661 void 662 vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high) 663 { 664 int actl, actmax, inactl, inactmax; 665 666 if (tries > 0) { 667 /* 668 * Decrease registered cache sizes. The vm_lowmem handlers 669 * may acquire locks and/or sleep, so they can only be invoked 670 * when "tries" is greater than zero. 
671 */ 672 EVENTHANDLER_INVOKE(vm_lowmem, 0); 673 674 /* 675 * We do this explicitly after the caches have been drained 676 * above. 677 */ 678 uma_reclaim(); 679 } 680 inactl = 0; 681 inactmax = cnt.v_inactive_count; 682 actl = 0; 683 actmax = tries < 2 ? 0 : cnt.v_active_count; 684 again: 685 if (inactl < inactmax && vm_pageout_launder(PQ_INACTIVE, tries, low, 686 high)) { 687 inactl++; 688 goto again; 689 } 690 if (actl < actmax && vm_pageout_launder(PQ_ACTIVE, tries, low, high)) { 691 actl++; 692 goto again; 693 } 694 } 695 696 #if !defined(NO_SWAPPING) 697 /* 698 * vm_pageout_object_deactivate_pages 699 * 700 * Deactivate enough pages to satisfy the inactive target 701 * requirements. 702 * 703 * The object and map must be locked. 704 */ 705 static void 706 vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, 707 long desired) 708 { 709 vm_object_t backing_object, object; 710 vm_page_t p; 711 int act_delta, remove_mode; 712 713 VM_OBJECT_ASSERT_LOCKED(first_object); 714 if ((first_object->flags & OBJ_FICTITIOUS) != 0) 715 return; 716 for (object = first_object;; object = backing_object) { 717 if (pmap_resident_count(pmap) <= desired) 718 goto unlock_return; 719 VM_OBJECT_ASSERT_LOCKED(object); 720 if ((object->flags & OBJ_UNMANAGED) != 0 || 721 object->paging_in_progress != 0) 722 goto unlock_return; 723 724 remove_mode = 0; 725 if (object->shadow_count > 1) 726 remove_mode = 1; 727 /* 728 * Scan the object's entire memory queue. 729 */ 730 TAILQ_FOREACH(p, &object->memq, listq) { 731 if (pmap_resident_count(pmap) <= desired) 732 goto unlock_return; 733 if ((p->oflags & VPO_BUSY) != 0 || p->busy != 0) 734 continue; 735 PCPU_INC(cnt.v_pdpages); 736 vm_page_lock(p); 737 if (p->wire_count != 0 || p->hold_count != 0 || 738 !pmap_page_exists_quick(pmap, p)) { 739 vm_page_unlock(p); 740 continue; 741 } 742 act_delta = pmap_ts_referenced(p); 743 if ((p->aflags & PGA_REFERENCED) != 0) { 744 if (act_delta == 0) 745 act_delta = 1; 746 vm_page_aflag_clear(p, PGA_REFERENCED); 747 } 748 if (p->queue != PQ_ACTIVE && act_delta != 0) { 749 vm_page_activate(p); 750 p->act_count += act_delta; 751 } else if (p->queue == PQ_ACTIVE) { 752 if (act_delta == 0) { 753 p->act_count -= min(p->act_count, 754 ACT_DECLINE); 755 if (!remove_mode && p->act_count == 0) { 756 pmap_remove_all(p); 757 vm_page_deactivate(p); 758 } else 759 vm_page_requeue(p); 760 } else { 761 vm_page_activate(p); 762 if (p->act_count < ACT_MAX - 763 ACT_ADVANCE) 764 p->act_count += ACT_ADVANCE; 765 vm_page_requeue(p); 766 } 767 } else if (p->queue == PQ_INACTIVE) 768 pmap_remove_all(p); 769 vm_page_unlock(p); 770 } 771 if ((backing_object = object->backing_object) == NULL) 772 goto unlock_return; 773 VM_OBJECT_RLOCK(backing_object); 774 if (object != first_object) 775 VM_OBJECT_RUNLOCK(object); 776 } 777 unlock_return: 778 if (object != first_object) 779 VM_OBJECT_RUNLOCK(object); 780 } 781 782 /* 783 * deactivate some number of pages in a map, try to do it fairly, but 784 * that is really hard to do. 785 */ 786 static void 787 vm_pageout_map_deactivate_pages(map, desired) 788 vm_map_t map; 789 long desired; 790 { 791 vm_map_entry_t tmpe; 792 vm_object_t obj, bigobj; 793 int nothingwired; 794 795 if (!vm_map_trylock(map)) 796 return; 797 798 bigobj = NULL; 799 nothingwired = TRUE; 800 801 /* 802 * first, search out the biggest object, and try to free pages from 803 * that. 
804 */ 805 tmpe = map->header.next; 806 while (tmpe != &map->header) { 807 if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { 808 obj = tmpe->object.vm_object; 809 if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { 810 if (obj->shadow_count <= 1 && 811 (bigobj == NULL || 812 bigobj->resident_page_count < obj->resident_page_count)) { 813 if (bigobj != NULL) 814 VM_OBJECT_RUNLOCK(bigobj); 815 bigobj = obj; 816 } else 817 VM_OBJECT_RUNLOCK(obj); 818 } 819 } 820 if (tmpe->wired_count > 0) 821 nothingwired = FALSE; 822 tmpe = tmpe->next; 823 } 824 825 if (bigobj != NULL) { 826 vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); 827 VM_OBJECT_RUNLOCK(bigobj); 828 } 829 /* 830 * Next, hunt around for other pages to deactivate. We actually 831 * do this search sort of wrong -- .text first is not the best idea. 832 */ 833 tmpe = map->header.next; 834 while (tmpe != &map->header) { 835 if (pmap_resident_count(vm_map_pmap(map)) <= desired) 836 break; 837 if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { 838 obj = tmpe->object.vm_object; 839 if (obj != NULL) { 840 VM_OBJECT_RLOCK(obj); 841 vm_pageout_object_deactivate_pages(map->pmap, obj, desired); 842 VM_OBJECT_RUNLOCK(obj); 843 } 844 } 845 tmpe = tmpe->next; 846 } 847 848 /* 849 * Remove all mappings if a process is swapped out, this will free page 850 * table pages. 851 */ 852 if (desired == 0 && nothingwired) { 853 pmap_remove(vm_map_pmap(map), vm_map_min(map), 854 vm_map_max(map)); 855 } 856 vm_map_unlock(map); 857 } 858 #endif /* !defined(NO_SWAPPING) */ 859 860 /* 861 * vm_pageout_scan does the dirty work for the pageout daemon. 862 */ 863 static void 864 vm_pageout_scan(int pass) 865 { 866 vm_page_t m, next; 867 struct vm_page marker; 868 struct vm_pagequeue *pq; 869 int page_shortage, maxscan, pcount; 870 int addl_page_shortage; 871 vm_object_t object; 872 int act_delta; 873 int vnodes_skipped = 0; 874 int maxlaunder; 875 boolean_t queues_locked; 876 877 vm_pageout_init_marker(&marker, PQ_INACTIVE); 878 879 /* 880 * Decrease registered cache sizes. 881 */ 882 EVENTHANDLER_INVOKE(vm_lowmem, 0); 883 /* 884 * We do this explicitly after the caches have been drained above. 885 */ 886 uma_reclaim(); 887 888 /* 889 * The addl_page_shortage is the number of temporarily 890 * stuck pages in the inactive queue. In other words, the 891 * number of pages from cnt.v_inactive_count that should be 892 * discounted in setting the target for the active queue scan. 893 */ 894 addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit); 895 896 /* 897 * Calculate the number of pages we want to either free or move 898 * to the cache. 899 */ 900 page_shortage = vm_paging_target() + addl_page_shortage; 901 902 /* 903 * maxlaunder limits the number of dirty pages we flush per scan. 904 * For most systems a smaller value (16 or 32) is more robust under 905 * extreme memory and disk pressure because any unnecessary writes 906 * to disk can result in extreme performance degredation. However, 907 * systems with excessive dirty pages (especially when MAP_NOSYNC is 908 * used) will die horribly with limited laundering. If the pageout 909 * daemon cannot clean enough pages in the first pass, we let it go 910 * all out in succeeding passes. 911 */ 912 if ((maxlaunder = vm_max_launder) <= 1) 913 maxlaunder = 1; 914 if (pass) 915 maxlaunder = 10000; 916 917 maxscan = cnt.v_inactive_count; 918 919 /* 920 * Start scanning the inactive queue for pages we can move to the 921 * cache or free. 
	 * The scan will stop when the target is reached or we have scanned
	 * the entire inactive queue.  Note that m->act_count is not used to
	 * form decisions for the inactive queue, only for the active queue.
	 */
	pq = &vm_pagequeues[PQ_INACTIVE];
	vm_pagequeue_lock(pq);
	queues_locked = TRUE;
	for (m = TAILQ_FIRST(&pq->pq_pl);
	     m != NULL && maxscan-- > 0 && page_shortage > 0;
	     m = next) {
		vm_pagequeue_assert_locked(pq);
		KASSERT(queues_locked, ("unlocked queues"));
		KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));

		PCPU_INC(cnt.v_pdpages);
		next = TAILQ_NEXT(m, pageq);

		/*
		 * skip marker pages
		 */
		if (m->flags & PG_MARKER)
			continue;

		KASSERT((m->flags & PG_FICTITIOUS) == 0,
		    ("Fictitious page %p cannot be in inactive queue", m));
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("Unmanaged page %p cannot be in inactive queue", m));

		/*
		 * The page or object lock acquisitions fail if the
		 * page was removed from the queue or moved to a
		 * different position within the queue.  In either
		 * case, addl_page_shortage should not be incremented.
		 */
		if (!vm_pageout_page_lock(m, &next)) {
			vm_page_unlock(m);
			continue;
		}
		object = m->object;
		if (!VM_OBJECT_TRYWLOCK(object) &&
		    !vm_pageout_fallback_object_lock(m, &next)) {
			vm_page_unlock(m);
			VM_OBJECT_WUNLOCK(object);
			continue;
		}

		/*
		 * Don't mess with busy pages, keep them at the front of the
		 * queue, most likely they are being paged out.  Increment
		 * addl_page_shortage for busy pages, because they may leave
		 * the inactive queue shortly after page scan is finished.
		 */
		if (m->busy != 0 || (m->oflags & VPO_BUSY) != 0) {
			vm_page_unlock(m);
			VM_OBJECT_WUNLOCK(object);
			addl_page_shortage++;
			continue;
		}

		/*
		 * We unlock the inactive page queue, invalidating the
		 * 'next' pointer.  Use our marker to remember our
		 * place.
		 */
		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq);
		vm_pagequeue_unlock(pq);
		queues_locked = FALSE;

		/*
		 * We bump the activation count if the page has been
		 * referenced while in the inactive queue.  This makes
		 * it less likely that the page will be added back to the
		 * inactive queue prematurely again.  Here we check the
		 * page tables (or emulated bits, if any), given the upper
		 * level VM system not knowing anything about existing
		 * references.
		 */
		act_delta = 0;
		if ((m->aflags & PGA_REFERENCED) != 0) {
			vm_page_aflag_clear(m, PGA_REFERENCED);
			act_delta = 1;
		}
		if (object->ref_count != 0) {
			act_delta += pmap_ts_referenced(m);
		} else {
			KASSERT(!pmap_page_is_mapped(m),
			    ("vm_pageout_scan: page %p is mapped", m));
		}

		/*
		 * If the upper level VM system knows about any page
		 * references, we reactivate the page or requeue it.
		 */
		if (act_delta != 0) {
			if (object->ref_count) {
				vm_page_activate(m);
				m->act_count += act_delta + ACT_ADVANCE;
			} else {
				vm_pagequeue_lock(pq);
				queues_locked = TRUE;
				vm_page_requeue_locked(m);
			}
			VM_OBJECT_WUNLOCK(object);
			vm_page_unlock(m);
			goto relock_queues;
		}

		if (m->hold_count != 0) {
			vm_page_unlock(m);
			VM_OBJECT_WUNLOCK(object);

			/*
			 * Held pages are essentially stuck in the
			 * queue.  So, they ought to be discounted
			 * from cnt.v_inactive_count.
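			 * (A non-zero hold count normally indicates a
			 * transient reference, for example I/O that is
			 * still in progress against the page.)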
			 * See the calculation of the page_shortage for
			 * the loop over the active queue below.
			 */
			addl_page_shortage++;
			goto relock_queues;
		}

		/*
		 * If the page appears to be clean at the machine-independent
		 * layer, then remove all of its mappings from the pmap in
		 * anticipation of placing it onto the cache queue.  If,
		 * however, any of the page's mappings allow write access,
		 * then the page may still be modified until the last of those
		 * mappings are removed.
		 */
		vm_page_test_dirty(m);
		if (m->dirty == 0 && object->ref_count != 0)
			pmap_remove_all(m);

		if (m->valid == 0) {
			/*
			 * Invalid pages can be easily freed
			 */
			vm_page_free(m);
			PCPU_INC(cnt.v_dfree);
			--page_shortage;
		} else if (m->dirty == 0) {
			/*
			 * Clean pages can be placed onto the cache queue.
			 * This effectively frees them.
			 */
			vm_page_cache(m);
			--page_shortage;
		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
			/*
			 * Dirty pages need to be paged out, but flushing
			 * a page is extremely expensive versus freeing
			 * a clean page.  Rather than artificially limiting
			 * the number of pages we can flush, we instead give
			 * dirty pages extra priority on the inactive queue
			 * by forcing them to be cycled through the queue
			 * twice before being flushed, after which the
			 * (now clean) page will cycle through once more
			 * before being freed.  This significantly extends
			 * the thrash point for a heavily loaded machine.
			 */
			m->flags |= PG_WINATCFLS;
			vm_pagequeue_lock(pq);
			queues_locked = TRUE;
			vm_page_requeue_locked(m);
		} else if (maxlaunder > 0) {
			/*
			 * We always want to try to flush some dirty pages if
			 * we encounter them, to keep the system stable.
			 * Normally this number is small, but under extreme
			 * pressure where there are insufficient clean pages
			 * on the inactive queue, we may have to go all out.
			 */
			int swap_pageouts_ok;
			struct vnode *vp = NULL;
			struct mount *mp = NULL;

			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
				swap_pageouts_ok = 1;
			} else {
				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
				    vm_page_count_min());

			}

			/*
			 * We don't bother paging objects that are "dead".
			 * Those objects are in a "rundown" state.
			 */
			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
				vm_pagequeue_lock(pq);
				vm_page_unlock(m);
				VM_OBJECT_WUNLOCK(object);
				queues_locked = TRUE;
				vm_page_requeue_locked(m);
				goto relock_queues;
			}

			/*
			 * The object is already known NOT to be dead.  It
			 * is possible for the vget() to block the whole
			 * pageout daemon, but the new low-memory handling
			 * code should prevent it.
			 *
			 * The previous code skipped locked vnodes and, worse,
			 * reordered pages in the queue.  This results in
			 * completely non-deterministic operation and, on a
			 * busy system, can lead to extremely non-optimal
			 * pageouts.  For example, it can cause clean pages
			 * to be freed and dirty pages to be moved to the end
			 * of the queue.  Since dirty pages are also moved to
			 * the end of the queue once-cleaned, this gives
			 * way too large a weighting to deferring the freeing
			 * of dirty pages.
			 *
			 * We can't wait forever for the vnode lock, we might
			 * deadlock due to a vn_read() getting stuck in
			 * vm_wait while holding this vnode.  We skip the
			 * vnode if we can't get it in a reasonable amount
			 * of time.
			 */
			if (object->type == OBJT_VNODE) {
				vm_page_unlock(m);
				vp = object->handle;
				if (vp->v_type == VREG &&
				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
					mp = NULL;
					++pageout_lock_miss;
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}
				KASSERT(mp != NULL,
				    ("vp %p with NULL v_mount", vp));
				vm_object_reference_locked(object);
				VM_OBJECT_WUNLOCK(object);
				if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK,
				    curthread)) {
					VM_OBJECT_WLOCK(object);
					++pageout_lock_miss;
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					vp = NULL;
					goto unlock_and_continue;
				}
				VM_OBJECT_WLOCK(object);
				vm_page_lock(m);
				vm_pagequeue_lock(pq);
				queues_locked = TRUE;
				/*
				 * The page might have been moved to another
				 * queue during potential blocking in vget()
				 * above.  The page might have been freed and
				 * reused for another vnode.
				 */
				if (m->queue != PQ_INACTIVE ||
				    m->object != object ||
				    TAILQ_NEXT(m, pageq) != &marker) {
					vm_page_unlock(m);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}

				/*
				 * The page may have been busied during the
				 * blocking in vget().  We don't move the
				 * page back onto the end of the queue so that
				 * statistics are more correct if we don't.
				 */
				if (m->busy || (m->oflags & VPO_BUSY)) {
					vm_page_unlock(m);
					goto unlock_and_continue;
				}

				/*
				 * If the page has become held it might
				 * be undergoing I/O, so skip it
				 */
				if (m->hold_count) {
					vm_page_unlock(m);
					vm_page_requeue_locked(m);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}
				vm_pagequeue_unlock(pq);
				queues_locked = FALSE;
			}

			/*
			 * If a page is dirty, then it is either being washed
			 * (but not yet cleaned) or it is still in the
			 * laundry.  If it is still in the laundry, then we
			 * start the cleaning operation.
			 *
			 * decrement page_shortage on success to account for
			 * the (future) cleaned page.  Otherwise we could wind
			 * up laundering or cleaning too many pages.
			 */
			if (vm_pageout_clean(m) != 0) {
				--page_shortage;
				--maxlaunder;
			}
unlock_and_continue:
			vm_page_lock_assert(m, MA_NOTOWNED);
			VM_OBJECT_WUNLOCK(object);
			if (mp != NULL) {
				if (queues_locked) {
					vm_pagequeue_unlock(pq);
					queues_locked = FALSE;
				}
				if (vp != NULL)
					vput(vp);
				vm_object_deallocate(object);
				vn_finished_write(mp);
			}
			vm_page_lock_assert(m, MA_NOTOWNED);
			goto relock_queues;
		}
		vm_page_unlock(m);
		VM_OBJECT_WUNLOCK(object);
relock_queues:
		if (!queues_locked) {
			vm_pagequeue_lock(pq);
			queues_locked = TRUE;
		}
		next = TAILQ_NEXT(&marker, pageq);
		TAILQ_REMOVE(&pq->pq_pl, &marker, pageq);
	}
	vm_pagequeue_unlock(pq);

	/*
	 * Compute the number of pages we want to try to move from the
	 * active queue to the inactive queue.
	 */
	page_shortage = vm_paging_target() +
		cnt.v_inactive_target - cnt.v_inactive_count;
	page_shortage += addl_page_shortage;

	/*
	 * Scan the active queue for things we can deactivate.  We nominally
	 * track the per-page activity counter and use it to locate
	 * deactivation candidates.
	 */
	pcount = cnt.v_active_count;
	pq = &vm_pagequeues[PQ_ACTIVE];
	vm_pagequeue_lock(pq);
	m = TAILQ_FIRST(&pq->pq_pl);
	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {

		KASSERT(m->queue == PQ_ACTIVE,
		    ("vm_pageout_scan: page %p isn't active", m));

		next = TAILQ_NEXT(m, pageq);
		if ((m->flags & PG_MARKER) != 0) {
			m = next;
			continue;
		}
		KASSERT((m->flags & PG_FICTITIOUS) == 0,
		    ("Fictitious page %p cannot be in active queue", m));
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("Unmanaged page %p cannot be in active queue", m));
		if (!vm_pageout_page_lock(m, &next)) {
			vm_page_unlock(m);
			m = next;
			continue;
		}
		object = m->object;
		if (!VM_OBJECT_TRYWLOCK(object) &&
		    !vm_pageout_fallback_object_lock(m, &next)) {
			VM_OBJECT_WUNLOCK(object);
			vm_page_unlock(m);
			m = next;
			continue;
		}

		/*
		 * Don't deactivate pages that are busy.
		 */
		if ((m->busy != 0) ||
		    (m->oflags & VPO_BUSY) ||
		    (m->hold_count != 0)) {
			vm_page_unlock(m);
			VM_OBJECT_WUNLOCK(object);
			vm_page_requeue_locked(m);
			m = next;
			continue;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		PCPU_INC(cnt.v_pdpages);

		/*
		 * Check to see "how much" the page has been used.
		 */
		act_delta = 0;
		if (m->aflags & PGA_REFERENCED) {
			vm_page_aflag_clear(m, PGA_REFERENCED);
			act_delta += 1;
		}
		if (object->ref_count != 0)
			act_delta += pmap_ts_referenced(m);

		/*
		 * Advance or decay the act_count based on recent usage.
		 */
		if (act_delta) {
			m->act_count += ACT_ADVANCE + act_delta;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			act_delta = m->act_count;
		}

		/*
		 * Move this page to the tail of the active or inactive
		 * queue depending on usage.
		 */
		if (act_delta == 0) {
			KASSERT(object->ref_count != 0 ||
			    !pmap_page_is_mapped(m),
			    ("vm_pageout_scan: page %p is mapped", m));
			/* Dequeue to avoid later lock recursion. */
			vm_page_dequeue_locked(m);
			vm_page_deactivate(m);
			page_shortage--;
		} else
			vm_page_requeue_locked(m);
		vm_page_unlock(m);
		VM_OBJECT_WUNLOCK(object);
		m = next;
	}
	vm_pagequeue_unlock(pq);
#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static long lsec;
		if (time_second != lsec) {
			vm_req_vmdaemon(VM_SWAP_IDLE);
			lsec = time_second;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
1379 */ 1380 if (vm_paging_target() > 0) { 1381 if (vnodes_skipped && vm_page_count_min()) 1382 (void) speedup_syncer(); 1383 #if !defined(NO_SWAPPING) 1384 if (vm_swap_enabled && vm_page_count_target()) 1385 vm_req_vmdaemon(VM_SWAP_NORMAL); 1386 #endif 1387 } 1388 1389 /* 1390 * If we are critically low on one of RAM or swap and low on 1391 * the other, kill the largest process. However, we avoid 1392 * doing this on the first pass in order to give ourselves a 1393 * chance to flush out dirty vnode-backed pages and to allow 1394 * active pages to be moved to the inactive queue and reclaimed. 1395 */ 1396 if (pass != 0 && 1397 ((swap_pager_avail < 64 && vm_page_count_min()) || 1398 (swap_pager_full && vm_paging_target() > 0))) 1399 vm_pageout_oom(VM_OOM_MEM); 1400 } 1401 1402 1403 void 1404 vm_pageout_oom(int shortage) 1405 { 1406 struct proc *p, *bigproc; 1407 vm_offset_t size, bigsize; 1408 struct thread *td; 1409 struct vmspace *vm; 1410 1411 /* 1412 * We keep the process bigproc locked once we find it to keep anyone 1413 * from messing with it; however, there is a possibility of 1414 * deadlock if process B is bigproc and one of it's child processes 1415 * attempts to propagate a signal to B while we are waiting for A's 1416 * lock while walking this list. To avoid this, we don't block on 1417 * the process lock but just skip a process if it is already locked. 1418 */ 1419 bigproc = NULL; 1420 bigsize = 0; 1421 sx_slock(&allproc_lock); 1422 FOREACH_PROC_IN_SYSTEM(p) { 1423 int breakout; 1424 1425 if (PROC_TRYLOCK(p) == 0) 1426 continue; 1427 /* 1428 * If this is a system, protected or killed process, skip it. 1429 */ 1430 if (p->p_state != PRS_NORMAL || 1431 (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM)) || 1432 (p->p_pid == 1) || P_KILLED(p) || 1433 ((p->p_pid < 48) && (swap_pager_avail != 0))) { 1434 PROC_UNLOCK(p); 1435 continue; 1436 } 1437 /* 1438 * If the process is in a non-running type state, 1439 * don't touch it. Check all the threads individually. 1440 */ 1441 breakout = 0; 1442 FOREACH_THREAD_IN_PROC(p, td) { 1443 thread_lock(td); 1444 if (!TD_ON_RUNQ(td) && 1445 !TD_IS_RUNNING(td) && 1446 !TD_IS_SLEEPING(td) && 1447 !TD_IS_SUSPENDED(td)) { 1448 thread_unlock(td); 1449 breakout = 1; 1450 break; 1451 } 1452 thread_unlock(td); 1453 } 1454 if (breakout) { 1455 PROC_UNLOCK(p); 1456 continue; 1457 } 1458 /* 1459 * get the process size 1460 */ 1461 vm = vmspace_acquire_ref(p); 1462 if (vm == NULL) { 1463 PROC_UNLOCK(p); 1464 continue; 1465 } 1466 if (!vm_map_trylock_read(&vm->vm_map)) { 1467 vmspace_free(vm); 1468 PROC_UNLOCK(p); 1469 continue; 1470 } 1471 size = vmspace_swap_count(vm); 1472 vm_map_unlock_read(&vm->vm_map); 1473 if (shortage == VM_OOM_MEM) 1474 size += vmspace_resident_count(vm); 1475 vmspace_free(vm); 1476 /* 1477 * if the this process is bigger than the biggest one 1478 * remember it. 1479 */ 1480 if (size > bigsize) { 1481 if (bigproc != NULL) 1482 PROC_UNLOCK(bigproc); 1483 bigproc = p; 1484 bigsize = size; 1485 } else 1486 PROC_UNLOCK(p); 1487 } 1488 sx_sunlock(&allproc_lock); 1489 if (bigproc != NULL) { 1490 killproc(bigproc, "out of swap space"); 1491 sched_nice(bigproc, PRIO_MIN); 1492 PROC_UNLOCK(bigproc); 1493 wakeup(&cnt.v_free_count); 1494 } 1495 } 1496 1497 /* 1498 * This routine tries to maintain the pseudo LRU active queue, 1499 * so that during long periods of time where there is no paging, 1500 * that some statistic accumulation still occurs. This code 1501 * helps the situation where paging just starts to occur. 
1502 */ 1503 static void 1504 vm_pageout_page_stats(void) 1505 { 1506 struct vm_pagequeue *pq; 1507 vm_object_t object; 1508 vm_page_t m, next; 1509 int pcount, tpcount; /* Number of pages to check */ 1510 static int fullintervalcount = 0; 1511 int page_shortage; 1512 1513 page_shortage = 1514 (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) - 1515 (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); 1516 1517 if (page_shortage <= 0) 1518 return; 1519 1520 pcount = cnt.v_active_count; 1521 fullintervalcount += vm_pageout_stats_interval; 1522 if (fullintervalcount < vm_pageout_full_stats_interval) { 1523 vm_pageout_stats++; 1524 tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count / 1525 cnt.v_page_count; 1526 if (pcount > tpcount) 1527 pcount = tpcount; 1528 } else { 1529 vm_pageout_full_stats++; 1530 fullintervalcount = 0; 1531 } 1532 1533 pq = &vm_pagequeues[PQ_ACTIVE]; 1534 vm_pagequeue_lock(pq); 1535 m = TAILQ_FIRST(&pq->pq_pl); 1536 while ((m != NULL) && (pcount-- > 0)) { 1537 int actcount; 1538 1539 KASSERT(m->queue == PQ_ACTIVE, 1540 ("vm_pageout_page_stats: page %p isn't active", m)); 1541 1542 next = TAILQ_NEXT(m, pageq); 1543 if ((m->flags & PG_MARKER) != 0) { 1544 m = next; 1545 continue; 1546 } 1547 vm_page_lock_assert(m, MA_NOTOWNED); 1548 if (!vm_pageout_page_lock(m, &next)) { 1549 vm_page_unlock(m); 1550 m = next; 1551 continue; 1552 } 1553 object = m->object; 1554 if (!VM_OBJECT_TRYWLOCK(object) && 1555 !vm_pageout_fallback_object_lock(m, &next)) { 1556 VM_OBJECT_WUNLOCK(object); 1557 vm_page_unlock(m); 1558 m = next; 1559 continue; 1560 } 1561 1562 /* 1563 * Don't deactivate pages that are busy. 1564 */ 1565 if ((m->busy != 0) || 1566 (m->oflags & VPO_BUSY) || 1567 (m->hold_count != 0)) { 1568 vm_page_unlock(m); 1569 VM_OBJECT_WUNLOCK(object); 1570 vm_page_requeue_locked(m); 1571 m = next; 1572 continue; 1573 } 1574 1575 actcount = 0; 1576 if (m->aflags & PGA_REFERENCED) { 1577 vm_page_aflag_clear(m, PGA_REFERENCED); 1578 actcount += 1; 1579 } 1580 1581 actcount += pmap_ts_referenced(m); 1582 if (actcount) { 1583 m->act_count += ACT_ADVANCE + actcount; 1584 if (m->act_count > ACT_MAX) 1585 m->act_count = ACT_MAX; 1586 vm_page_requeue_locked(m); 1587 } else { 1588 if (m->act_count == 0) { 1589 /* 1590 * We turn off page access, so that we have 1591 * more accurate RSS stats. We don't do this 1592 * in the normal page deactivation when the 1593 * system is loaded VM wise, because the 1594 * cost of the large number of page protect 1595 * operations would be higher than the value 1596 * of doing the operation. 1597 */ 1598 pmap_remove_all(m); 1599 /* Dequeue to avoid later lock recursion. */ 1600 vm_page_dequeue_locked(m); 1601 vm_page_deactivate(m); 1602 } else { 1603 m->act_count -= min(m->act_count, ACT_DECLINE); 1604 vm_page_requeue_locked(m); 1605 } 1606 } 1607 vm_page_unlock(m); 1608 VM_OBJECT_WUNLOCK(object); 1609 m = next; 1610 } 1611 vm_pagequeue_unlock(pq); 1612 } 1613 1614 /* 1615 * vm_pageout is the high level pageout daemon. 1616 */ 1617 static void 1618 vm_pageout(void) 1619 { 1620 int error, pass; 1621 1622 /* 1623 * Initialize some paging parameters. 1624 */ 1625 cnt.v_interrupt_free_min = 2; 1626 if (cnt.v_page_count < 2000) 1627 vm_pageout_page_count = 8; 1628 1629 /* 1630 * v_free_reserved needs to include enough for the largest 1631 * swap pager structures plus enough for any pv_entry structs 1632 * when paging. 
1633 */ 1634 if (cnt.v_page_count > 1024) 1635 cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; 1636 else 1637 cnt.v_free_min = 4; 1638 cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + 1639 cnt.v_interrupt_free_min; 1640 cnt.v_free_reserved = vm_pageout_page_count + 1641 cnt.v_pageout_free_min + (cnt.v_page_count / 768); 1642 cnt.v_free_severe = cnt.v_free_min / 2; 1643 cnt.v_free_min += cnt.v_free_reserved; 1644 cnt.v_free_severe += cnt.v_free_reserved; 1645 1646 /* 1647 * v_free_target and v_cache_min control pageout hysteresis. Note 1648 * that these are more a measure of the VM cache queue hysteresis 1649 * then the VM free queue. Specifically, v_free_target is the 1650 * high water mark (free+cache pages). 1651 * 1652 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the 1653 * low water mark, while v_free_min is the stop. v_cache_min must 1654 * be big enough to handle memory needs while the pageout daemon 1655 * is signalled and run to free more pages. 1656 */ 1657 if (cnt.v_free_count > 6144) 1658 cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; 1659 else 1660 cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved; 1661 1662 if (cnt.v_free_count > 2048) { 1663 cnt.v_cache_min = cnt.v_free_target; 1664 cnt.v_cache_max = 2 * cnt.v_cache_min; 1665 cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; 1666 } else { 1667 cnt.v_cache_min = 0; 1668 cnt.v_cache_max = 0; 1669 cnt.v_inactive_target = cnt.v_free_count / 4; 1670 } 1671 if (cnt.v_inactive_target > cnt.v_free_count / 3) 1672 cnt.v_inactive_target = cnt.v_free_count / 3; 1673 1674 /* XXX does not really belong here */ 1675 if (vm_page_max_wired == 0) 1676 vm_page_max_wired = cnt.v_free_count / 3; 1677 1678 if (vm_pageout_stats_max == 0) 1679 vm_pageout_stats_max = cnt.v_free_target; 1680 1681 /* 1682 * Set interval in seconds for stats scan. 1683 */ 1684 if (vm_pageout_stats_interval == 0) 1685 vm_pageout_stats_interval = 5; 1686 if (vm_pageout_full_stats_interval == 0) 1687 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; 1688 1689 swap_pager_swap_init(); 1690 pass = 0; 1691 /* 1692 * The pageout daemon is never done, so loop forever. 1693 */ 1694 while (TRUE) { 1695 /* 1696 * If we have enough free memory, wakeup waiters. Do 1697 * not clear vm_pages_needed until we reach our target, 1698 * otherwise we may be woken up over and over again and 1699 * waste a lot of cpu. 1700 */ 1701 mtx_lock(&vm_page_queue_free_mtx); 1702 if (vm_pages_needed && !vm_page_count_min()) { 1703 if (!vm_paging_needed()) 1704 vm_pages_needed = 0; 1705 wakeup(&cnt.v_free_count); 1706 } 1707 if (vm_pages_needed) { 1708 /* 1709 * Still not done, take a second pass without waiting 1710 * (unlimited dirty cleaning), otherwise sleep a bit 1711 * and try again. 1712 */ 1713 ++pass; 1714 if (pass > 1) 1715 msleep(&vm_pages_needed, 1716 &vm_page_queue_free_mtx, PVM, "psleep", 1717 hz / 2); 1718 } else { 1719 /* 1720 * Good enough, sleep & handle stats. Prime the pass 1721 * for the next run. 
1722 */ 1723 if (pass > 1) 1724 pass = 1; 1725 else 1726 pass = 0; 1727 error = msleep(&vm_pages_needed, 1728 &vm_page_queue_free_mtx, PVM, "psleep", 1729 vm_pageout_stats_interval * hz); 1730 if (error && !vm_pages_needed) { 1731 mtx_unlock(&vm_page_queue_free_mtx); 1732 pass = 0; 1733 vm_pageout_page_stats(); 1734 continue; 1735 } 1736 } 1737 if (vm_pages_needed) 1738 cnt.v_pdwakeups++; 1739 mtx_unlock(&vm_page_queue_free_mtx); 1740 vm_pageout_scan(pass); 1741 } 1742 } 1743 1744 /* 1745 * Unless the free page queue lock is held by the caller, this function 1746 * should be regarded as advisory. Specifically, the caller should 1747 * not msleep() on &cnt.v_free_count following this function unless 1748 * the free page queue lock is held until the msleep() is performed. 1749 */ 1750 void 1751 pagedaemon_wakeup(void) 1752 { 1753 1754 if (!vm_pages_needed && curthread->td_proc != pageproc) { 1755 vm_pages_needed = 1; 1756 wakeup(&vm_pages_needed); 1757 } 1758 } 1759 1760 #if !defined(NO_SWAPPING) 1761 static void 1762 vm_req_vmdaemon(int req) 1763 { 1764 static int lastrun = 0; 1765 1766 mtx_lock(&vm_daemon_mtx); 1767 vm_pageout_req_swapout |= req; 1768 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 1769 wakeup(&vm_daemon_needed); 1770 lastrun = ticks; 1771 } 1772 mtx_unlock(&vm_daemon_mtx); 1773 } 1774 1775 static void 1776 vm_daemon(void) 1777 { 1778 struct rlimit rsslim; 1779 struct proc *p; 1780 struct thread *td; 1781 struct vmspace *vm; 1782 int breakout, swapout_flags, tryagain, attempts; 1783 #ifdef RACCT 1784 uint64_t rsize, ravailable; 1785 #endif 1786 1787 while (TRUE) { 1788 mtx_lock(&vm_daemon_mtx); 1789 #ifdef RACCT 1790 msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", hz); 1791 #else 1792 msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0); 1793 #endif 1794 swapout_flags = vm_pageout_req_swapout; 1795 vm_pageout_req_swapout = 0; 1796 mtx_unlock(&vm_daemon_mtx); 1797 if (swapout_flags) 1798 swapout_procs(swapout_flags); 1799 1800 /* 1801 * scan the processes for exceeding their rlimits or if 1802 * process is swapped out -- deactivate pages 1803 */ 1804 tryagain = 0; 1805 attempts = 0; 1806 again: 1807 attempts++; 1808 sx_slock(&allproc_lock); 1809 FOREACH_PROC_IN_SYSTEM(p) { 1810 vm_pindex_t limit, size; 1811 1812 /* 1813 * if this is a system process or if we have already 1814 * looked at this process, skip it. 1815 */ 1816 PROC_LOCK(p); 1817 if (p->p_state != PRS_NORMAL || 1818 p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { 1819 PROC_UNLOCK(p); 1820 continue; 1821 } 1822 /* 1823 * if the process is in a non-running type state, 1824 * don't touch it. 1825 */ 1826 breakout = 0; 1827 FOREACH_THREAD_IN_PROC(p, td) { 1828 thread_lock(td); 1829 if (!TD_ON_RUNQ(td) && 1830 !TD_IS_RUNNING(td) && 1831 !TD_IS_SLEEPING(td) && 1832 !TD_IS_SUSPENDED(td)) { 1833 thread_unlock(td); 1834 breakout = 1; 1835 break; 1836 } 1837 thread_unlock(td); 1838 } 1839 if (breakout) { 1840 PROC_UNLOCK(p); 1841 continue; 1842 } 1843 /* 1844 * get a limit 1845 */ 1846 lim_rlimit(p, RLIMIT_RSS, &rsslim); 1847 limit = OFF_TO_IDX( 1848 qmin(rsslim.rlim_cur, rsslim.rlim_max)); 1849 1850 /* 1851 * let processes that are swapped out really be 1852 * swapped out set the limit to nothing (will force a 1853 * swap-out.) 
1854 */ 1855 if ((p->p_flag & P_INMEM) == 0) 1856 limit = 0; /* XXX */ 1857 vm = vmspace_acquire_ref(p); 1858 PROC_UNLOCK(p); 1859 if (vm == NULL) 1860 continue; 1861 1862 size = vmspace_resident_count(vm); 1863 if (size >= limit) { 1864 vm_pageout_map_deactivate_pages( 1865 &vm->vm_map, limit); 1866 } 1867 #ifdef RACCT 1868 rsize = IDX_TO_OFF(size); 1869 PROC_LOCK(p); 1870 racct_set(p, RACCT_RSS, rsize); 1871 ravailable = racct_get_available(p, RACCT_RSS); 1872 PROC_UNLOCK(p); 1873 if (rsize > ravailable) { 1874 /* 1875 * Don't be overly aggressive; this might be 1876 * an innocent process, and the limit could've 1877 * been exceeded by some memory hog. Don't 1878 * try to deactivate more than 1/4th of process' 1879 * resident set size. 1880 */ 1881 if (attempts <= 8) { 1882 if (ravailable < rsize - (rsize / 4)) 1883 ravailable = rsize - (rsize / 4); 1884 } 1885 vm_pageout_map_deactivate_pages( 1886 &vm->vm_map, OFF_TO_IDX(ravailable)); 1887 /* Update RSS usage after paging out. */ 1888 size = vmspace_resident_count(vm); 1889 rsize = IDX_TO_OFF(size); 1890 PROC_LOCK(p); 1891 racct_set(p, RACCT_RSS, rsize); 1892 PROC_UNLOCK(p); 1893 if (rsize > ravailable) 1894 tryagain = 1; 1895 } 1896 #endif 1897 vmspace_free(vm); 1898 } 1899 sx_sunlock(&allproc_lock); 1900 if (tryagain != 0 && attempts <= 10) 1901 goto again; 1902 } 1903 } 1904 #endif /* !defined(NO_SWAPPING) */ 1905