1 /*- 2 * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2005 Yahoo! Technologies Norway AS 11 * All rights reserved. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * The Mach Operating System project at Carnegie-Mellon University. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. All advertising materials mentioning features or use of this software 25 * must display the following acknowledgement: 26 * This product includes software developed by the University of 27 * California, Berkeley and its contributors. 28 * 4. Neither the name of the University nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 * 44 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 45 * 46 * 47 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 48 * All rights reserved. 49 * 50 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 51 * 52 * Permission to use, copy, modify and distribute this software and 53 * its documentation is hereby granted, provided that both the copyright 54 * notice and this permission notice appear in all copies of the 55 * software, derivative works or modified versions, and any portions 56 * thereof, and that both notices appear in supporting documentation. 57 * 58 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 59 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 60 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 61 * 62 * Carnegie Mellon requests users of this software to return to 63 * 64 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 65 * School of Computer Science 66 * Carnegie Mellon University 67 * Pittsburgh PA 15213-3890 68 * 69 * any improvements or extensions that they make and grant Carnegie the 70 * rights to redistribute these changes. 71 */ 72 73 /* 74 * The proverbial page-out daemon. 75 */ 76 77 #include <sys/cdefs.h> 78 __FBSDID("$FreeBSD$"); 79 80 #include "opt_vm.h" 81 82 #include <sys/param.h> 83 #include <sys/systm.h> 84 #include <sys/kernel.h> 85 #include <sys/eventhandler.h> 86 #include <sys/lock.h> 87 #include <sys/mutex.h> 88 #include <sys/proc.h> 89 #include <sys/kthread.h> 90 #include <sys/ktr.h> 91 #include <sys/mount.h> 92 #include <sys/racct.h> 93 #include <sys/resourcevar.h> 94 #include <sys/sched.h> 95 #include <sys/sdt.h> 96 #include <sys/signalvar.h> 97 #include <sys/smp.h> 98 #include <sys/time.h> 99 #include <sys/vnode.h> 100 #include <sys/vmmeter.h> 101 #include <sys/rwlock.h> 102 #include <sys/sx.h> 103 #include <sys/sysctl.h> 104 105 #include <vm/vm.h> 106 #include <vm/vm_param.h> 107 #include <vm/vm_object.h> 108 #include <vm/vm_page.h> 109 #include <vm/vm_map.h> 110 #include <vm/vm_pageout.h> 111 #include <vm/vm_pager.h> 112 #include <vm/vm_phys.h> 113 #include <vm/swap_pager.h> 114 #include <vm/vm_extern.h> 115 #include <vm/uma.h> 116 117 /* 118 * System initialization 119 */ 120 121 /* the kernel process "vm_pageout"*/ 122 static void vm_pageout(void); 123 static void vm_pageout_init(void); 124 static int vm_pageout_clean(vm_page_t m, int *numpagedout); 125 static int vm_pageout_cluster(vm_page_t m); 126 static bool vm_pageout_scan(struct vm_domain *vmd, int pass); 127 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, 128 int starting_page_shortage); 129 130 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, 131 NULL); 132 133 struct proc *pageproc; 134 135 static struct kproc_desc page_kp = { 136 "pagedaemon", 137 vm_pageout, 138 &pageproc 139 }; 140 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, 141 &page_kp); 142 143 SDT_PROVIDER_DEFINE(vm); 144 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); 145 146 /* Pagedaemon activity rates, in subdivisions of one second. */ 147 #define VM_LAUNDER_RATE 10 148 #define VM_INACT_SCAN_RATE 2 149 150 int vm_pageout_deficit; /* Estimated number of pages deficit */ 151 u_int vm_pageout_wakeup_thresh; 152 static int vm_pageout_oom_seq = 12; 153 static bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ 154 bool vm_pages_needed; /* Are threads waiting for free pages? */ 155 156 /* Pending request for dirty page laundering. */ 157 static enum { 158 VM_LAUNDRY_IDLE, 159 VM_LAUNDRY_BACKGROUND, 160 VM_LAUNDRY_SHORTFALL 161 } vm_laundry_request = VM_LAUNDRY_IDLE; 162 static int vm_inactq_scans; 163 164 static int vm_pageout_update_period; 165 static int disable_swap_pageouts; 166 static int lowmem_period = 10; 167 static time_t lowmem_uptime; 168 static int swapdev_enabled; 169 170 static int vm_panic_on_oom = 0; 171 172 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, 173 CTLFLAG_RWTUN, &vm_panic_on_oom, 0, 174 "panic on out of memory instead of killing the largest process"); 175 176 SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, 177 CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0, 178 "free page threshold for waking up the pageout daemon"); 179 180 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, 181 CTLFLAG_RWTUN, &vm_pageout_update_period, 0, 182 "Maximum active LRU update period"); 183 184 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0, 185 "Low memory callback period"); 186 187 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 188 CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 189 190 static int pageout_lock_miss; 191 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 192 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 193 194 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, 195 CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0, 196 "back-to-back calls to oom detector to start OOM"); 197 198 static int act_scan_laundry_weight = 3; 199 SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN, 200 &act_scan_laundry_weight, 0, 201 "weight given to clean vs. dirty pages in active queue scans"); 202 203 static u_int vm_background_launder_target; 204 SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN, 205 &vm_background_launder_target, 0, 206 "background laundering target, in pages"); 207 208 static u_int vm_background_launder_rate = 4096; 209 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN, 210 &vm_background_launder_rate, 0, 211 "background laundering rate, in kilobytes per second"); 212 213 static u_int vm_background_launder_max = 20 * 1024; 214 SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN, 215 &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); 216 217 int vm_pageout_page_count = 32; 218 219 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 220 SYSCTL_INT(_vm, OID_AUTO, max_wired, 221 CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); 222 223 static u_int isqrt(u_int num); 224 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); 225 static int vm_pageout_launder(struct vm_domain *vmd, int launder, 226 bool in_shortfall); 227 static void vm_pageout_laundry_worker(void *arg); 228 static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); 229 230 /* 231 * Initialize a dummy page for marking the caller's place in the specified 232 * paging queue. In principle, this function only needs to set the flag 233 * PG_MARKER. Nonetheless, it write busies and initializes the hold count 234 * to one as safety precautions. 235 */ 236 static void 237 vm_pageout_init_marker(vm_page_t marker, u_short queue) 238 { 239 240 bzero(marker, sizeof(*marker)); 241 marker->flags = PG_MARKER; 242 marker->busy_lock = VPB_SINGLE_EXCLUSIVER; 243 marker->queue = queue; 244 marker->hold_count = 1; 245 } 246 247 /* 248 * vm_pageout_fallback_object_lock: 249 * 250 * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is 251 * known to have failed and page queue must be either PQ_ACTIVE or 252 * PQ_INACTIVE. To avoid lock order violation, unlock the page queue 253 * while locking the vm object. Use marker page to detect page queue 254 * changes and maintain notion of next page on page queue. Return 255 * TRUE if no changes were detected, FALSE otherwise. vm object is 256 * locked on return. 257 * 258 * This function depends on both the lock portion of struct vm_object 259 * and normal struct vm_page being type stable. 260 */ 261 static boolean_t 262 vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) 263 { 264 struct vm_page marker; 265 struct vm_pagequeue *pq; 266 boolean_t unchanged; 267 u_short queue; 268 vm_object_t object; 269 270 queue = m->queue; 271 vm_pageout_init_marker(&marker, queue); 272 pq = vm_page_pagequeue(m); 273 object = m->object; 274 275 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); 276 vm_pagequeue_unlock(pq); 277 vm_page_unlock(m); 278 VM_OBJECT_WLOCK(object); 279 vm_page_lock(m); 280 vm_pagequeue_lock(pq); 281 282 /* 283 * The page's object might have changed, and/or the page might 284 * have moved from its original position in the queue. If the 285 * page's object has changed, then the caller should abandon 286 * processing the page because the wrong object lock was 287 * acquired. Use the marker's plinks.q, not the page's, to 288 * determine if the page has been moved. The state of the 289 * page's plinks.q can be indeterminate; whereas, the marker's 290 * plinks.q must be valid. 291 */ 292 *next = TAILQ_NEXT(&marker, plinks.q); 293 unchanged = m->object == object && 294 m == TAILQ_PREV(&marker, pglist, plinks.q); 295 KASSERT(!unchanged || m->queue == queue, 296 ("page %p queue %d %d", m, queue, m->queue)); 297 TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); 298 return (unchanged); 299 } 300 301 /* 302 * Lock the page while holding the page queue lock. Use marker page 303 * to detect page queue changes and maintain notion of next page on 304 * page queue. Return TRUE if no changes were detected, FALSE 305 * otherwise. The page is locked on return. The page queue lock might 306 * be dropped and reacquired. 307 * 308 * This function depends on normal struct vm_page being type stable. 309 */ 310 static boolean_t 311 vm_pageout_page_lock(vm_page_t m, vm_page_t *next) 312 { 313 struct vm_page marker; 314 struct vm_pagequeue *pq; 315 boolean_t unchanged; 316 u_short queue; 317 318 vm_page_lock_assert(m, MA_NOTOWNED); 319 if (vm_page_trylock(m)) 320 return (TRUE); 321 322 queue = m->queue; 323 vm_pageout_init_marker(&marker, queue); 324 pq = vm_page_pagequeue(m); 325 326 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); 327 vm_pagequeue_unlock(pq); 328 vm_page_lock(m); 329 vm_pagequeue_lock(pq); 330 331 /* Page queue might have changed. */ 332 *next = TAILQ_NEXT(&marker, plinks.q); 333 unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q); 334 KASSERT(!unchanged || m->queue == queue, 335 ("page %p queue %d %d", m, queue, m->queue)); 336 TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); 337 return (unchanged); 338 } 339 340 /* 341 * Scan for pages at adjacent offsets within the given page's object that are 342 * eligible for laundering, form a cluster of these pages and the given page, 343 * and launder that cluster. 344 */ 345 static int 346 vm_pageout_cluster(vm_page_t m) 347 { 348 vm_object_t object; 349 vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; 350 vm_pindex_t pindex; 351 int ib, is, page_base, pageout_count; 352 353 vm_page_assert_locked(m); 354 object = m->object; 355 VM_OBJECT_ASSERT_WLOCKED(object); 356 pindex = m->pindex; 357 358 /* 359 * We can't clean the page if it is busy or held. 360 */ 361 vm_page_assert_unbusied(m); 362 KASSERT(m->hold_count == 0, ("page %p is held", m)); 363 364 pmap_remove_write(m); 365 vm_page_unlock(m); 366 367 mc[vm_pageout_page_count] = pb = ps = m; 368 pageout_count = 1; 369 page_base = vm_pageout_page_count; 370 ib = 1; 371 is = 1; 372 373 /* 374 * We can cluster only if the page is not clean, busy, or held, and 375 * the page is in the laundry queue. 376 * 377 * During heavy mmap/modification loads the pageout 378 * daemon can really fragment the underlying file 379 * due to flushing pages out of order and not trying to 380 * align the clusters (which leaves sporadic out-of-order 381 * holes). To solve this problem we do the reverse scan 382 * first and attempt to align our cluster, then do a 383 * forward scan if room remains. 384 */ 385 more: 386 while (ib != 0 && pageout_count < vm_pageout_page_count) { 387 if (ib > pindex) { 388 ib = 0; 389 break; 390 } 391 if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { 392 ib = 0; 393 break; 394 } 395 vm_page_test_dirty(p); 396 if (p->dirty == 0) { 397 ib = 0; 398 break; 399 } 400 vm_page_lock(p); 401 if (!vm_page_in_laundry(p) || 402 p->hold_count != 0) { /* may be undergoing I/O */ 403 vm_page_unlock(p); 404 ib = 0; 405 break; 406 } 407 pmap_remove_write(p); 408 vm_page_unlock(p); 409 mc[--page_base] = pb = p; 410 ++pageout_count; 411 ++ib; 412 413 /* 414 * We are at an alignment boundary. Stop here, and switch 415 * directions. Do not clear ib. 416 */ 417 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) 418 break; 419 } 420 while (pageout_count < vm_pageout_page_count && 421 pindex + is < object->size) { 422 if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) 423 break; 424 vm_page_test_dirty(p); 425 if (p->dirty == 0) 426 break; 427 vm_page_lock(p); 428 if (!vm_page_in_laundry(p) || 429 p->hold_count != 0) { /* may be undergoing I/O */ 430 vm_page_unlock(p); 431 break; 432 } 433 pmap_remove_write(p); 434 vm_page_unlock(p); 435 mc[page_base + pageout_count] = ps = p; 436 ++pageout_count; 437 ++is; 438 } 439 440 /* 441 * If we exhausted our forward scan, continue with the reverse scan 442 * when possible, even past an alignment boundary. This catches 443 * boundary conditions. 444 */ 445 if (ib != 0 && pageout_count < vm_pageout_page_count) 446 goto more; 447 448 return (vm_pageout_flush(&mc[page_base], pageout_count, 449 VM_PAGER_PUT_NOREUSE, 0, NULL, NULL)); 450 } 451 452 /* 453 * vm_pageout_flush() - launder the given pages 454 * 455 * The given pages are laundered. Note that we setup for the start of 456 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 457 * reference count all in here rather then in the parent. If we want 458 * the parent to do more sophisticated things we may have to change 459 * the ordering. 460 * 461 * Returned runlen is the count of pages between mreq and first 462 * page after mreq with status VM_PAGER_AGAIN. 463 * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL 464 * for any page in runlen set. 465 */ 466 int 467 vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, 468 boolean_t *eio) 469 { 470 vm_object_t object = mc[0]->object; 471 int pageout_status[count]; 472 int numpagedout = 0; 473 int i, runlen; 474 475 VM_OBJECT_ASSERT_WLOCKED(object); 476 477 /* 478 * Initiate I/O. Mark the pages busy and verify that they're valid 479 * and read-only. 480 * 481 * We do not have to fixup the clean/dirty bits here... we can 482 * allow the pager to do it after the I/O completes. 483 * 484 * NOTE! mc[i]->dirty may be partial or fragmented due to an 485 * edge case with file fragments. 486 */ 487 for (i = 0; i < count; i++) { 488 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 489 ("vm_pageout_flush: partially invalid page %p index %d/%d", 490 mc[i], i, count)); 491 KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0, 492 ("vm_pageout_flush: writeable page %p", mc[i])); 493 vm_page_sbusy(mc[i]); 494 } 495 vm_object_pip_add(object, count); 496 497 vm_pager_put_pages(object, mc, count, flags, pageout_status); 498 499 runlen = count - mreq; 500 if (eio != NULL) 501 *eio = FALSE; 502 for (i = 0; i < count; i++) { 503 vm_page_t mt = mc[i]; 504 505 KASSERT(pageout_status[i] == VM_PAGER_PEND || 506 !pmap_page_is_write_mapped(mt), 507 ("vm_pageout_flush: page %p is not write protected", mt)); 508 switch (pageout_status[i]) { 509 case VM_PAGER_OK: 510 vm_page_lock(mt); 511 if (vm_page_in_laundry(mt)) 512 vm_page_deactivate_noreuse(mt); 513 vm_page_unlock(mt); 514 /* FALLTHROUGH */ 515 case VM_PAGER_PEND: 516 numpagedout++; 517 break; 518 case VM_PAGER_BAD: 519 /* 520 * The page is outside the object's range. We pretend 521 * that the page out worked and clean the page, so the 522 * changes will be lost if the page is reclaimed by 523 * the page daemon. 524 */ 525 vm_page_undirty(mt); 526 vm_page_lock(mt); 527 if (vm_page_in_laundry(mt)) 528 vm_page_deactivate_noreuse(mt); 529 vm_page_unlock(mt); 530 break; 531 case VM_PAGER_ERROR: 532 case VM_PAGER_FAIL: 533 /* 534 * If the page couldn't be paged out to swap because the 535 * pager wasn't able to find space, place the page in 536 * the PQ_UNSWAPPABLE holding queue. This is an 537 * optimization that prevents the page daemon from 538 * wasting CPU cycles on pages that cannot be reclaimed 539 * becase no swap device is configured. 540 * 541 * Otherwise, reactivate the page so that it doesn't 542 * clog the laundry and inactive queues. (We will try 543 * paging it out again later.) 544 */ 545 vm_page_lock(mt); 546 if (object->type == OBJT_SWAP && 547 pageout_status[i] == VM_PAGER_FAIL) { 548 vm_page_unswappable(mt); 549 numpagedout++; 550 } else 551 vm_page_activate(mt); 552 vm_page_unlock(mt); 553 if (eio != NULL && i >= mreq && i - mreq < runlen) 554 *eio = TRUE; 555 break; 556 case VM_PAGER_AGAIN: 557 if (i >= mreq && i - mreq < runlen) 558 runlen = i - mreq; 559 break; 560 } 561 562 /* 563 * If the operation is still going, leave the page busy to 564 * block all other accesses. Also, leave the paging in 565 * progress indicator set so that we don't attempt an object 566 * collapse. 567 */ 568 if (pageout_status[i] != VM_PAGER_PEND) { 569 vm_object_pip_wakeup(object); 570 vm_page_sunbusy(mt); 571 } 572 } 573 if (prunlen != NULL) 574 *prunlen = runlen; 575 return (numpagedout); 576 } 577 578 static void 579 vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused) 580 { 581 582 atomic_store_rel_int(&swapdev_enabled, 1); 583 } 584 585 static void 586 vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused) 587 { 588 589 if (swap_pager_nswapdev() == 1) 590 atomic_store_rel_int(&swapdev_enabled, 0); 591 } 592 593 /* 594 * Attempt to acquire all of the necessary locks to launder a page and 595 * then call through the clustering layer to PUTPAGES. Wait a short 596 * time for a vnode lock. 597 * 598 * Requires the page and object lock on entry, releases both before return. 599 * Returns 0 on success and an errno otherwise. 600 */ 601 static int 602 vm_pageout_clean(vm_page_t m, int *numpagedout) 603 { 604 struct vnode *vp; 605 struct mount *mp; 606 vm_object_t object; 607 vm_pindex_t pindex; 608 int error, lockmode; 609 610 vm_page_assert_locked(m); 611 object = m->object; 612 VM_OBJECT_ASSERT_WLOCKED(object); 613 error = 0; 614 vp = NULL; 615 mp = NULL; 616 617 /* 618 * The object is already known NOT to be dead. It 619 * is possible for the vget() to block the whole 620 * pageout daemon, but the new low-memory handling 621 * code should prevent it. 622 * 623 * We can't wait forever for the vnode lock, we might 624 * deadlock due to a vn_read() getting stuck in 625 * vm_wait while holding this vnode. We skip the 626 * vnode if we can't get it in a reasonable amount 627 * of time. 628 */ 629 if (object->type == OBJT_VNODE) { 630 vm_page_unlock(m); 631 vp = object->handle; 632 if (vp->v_type == VREG && 633 vn_start_write(vp, &mp, V_NOWAIT) != 0) { 634 mp = NULL; 635 error = EDEADLK; 636 goto unlock_all; 637 } 638 KASSERT(mp != NULL, 639 ("vp %p with NULL v_mount", vp)); 640 vm_object_reference_locked(object); 641 pindex = m->pindex; 642 VM_OBJECT_WUNLOCK(object); 643 lockmode = MNT_SHARED_WRITES(vp->v_mount) ? 644 LK_SHARED : LK_EXCLUSIVE; 645 if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { 646 vp = NULL; 647 error = EDEADLK; 648 goto unlock_mp; 649 } 650 VM_OBJECT_WLOCK(object); 651 652 /* 653 * Ensure that the object and vnode were not disassociated 654 * while locks were dropped. 655 */ 656 if (vp->v_object != object) { 657 error = ENOENT; 658 goto unlock_all; 659 } 660 vm_page_lock(m); 661 662 /* 663 * While the object and page were unlocked, the page 664 * may have been: 665 * (1) moved to a different queue, 666 * (2) reallocated to a different object, 667 * (3) reallocated to a different offset, or 668 * (4) cleaned. 669 */ 670 if (!vm_page_in_laundry(m) || m->object != object || 671 m->pindex != pindex || m->dirty == 0) { 672 vm_page_unlock(m); 673 error = ENXIO; 674 goto unlock_all; 675 } 676 677 /* 678 * The page may have been busied or held while the object 679 * and page locks were released. 680 */ 681 if (vm_page_busied(m) || m->hold_count != 0) { 682 vm_page_unlock(m); 683 error = EBUSY; 684 goto unlock_all; 685 } 686 } 687 688 /* 689 * If a page is dirty, then it is either being washed 690 * (but not yet cleaned) or it is still in the 691 * laundry. If it is still in the laundry, then we 692 * start the cleaning operation. 693 */ 694 if ((*numpagedout = vm_pageout_cluster(m)) == 0) 695 error = EIO; 696 697 unlock_all: 698 VM_OBJECT_WUNLOCK(object); 699 700 unlock_mp: 701 vm_page_lock_assert(m, MA_NOTOWNED); 702 if (mp != NULL) { 703 if (vp != NULL) 704 vput(vp); 705 vm_object_deallocate(object); 706 vn_finished_write(mp); 707 } 708 709 return (error); 710 } 711 712 /* 713 * Attempt to launder the specified number of pages. 714 * 715 * Returns the number of pages successfully laundered. 716 */ 717 static int 718 vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) 719 { 720 struct vm_pagequeue *pq; 721 vm_object_t object; 722 vm_page_t m, next; 723 int act_delta, error, maxscan, numpagedout, starting_target; 724 int vnodes_skipped; 725 bool pageout_ok, queue_locked; 726 727 starting_target = launder; 728 vnodes_skipped = 0; 729 730 /* 731 * Scan the laundry queues for pages eligible to be laundered. We stop 732 * once the target number of dirty pages have been laundered, or once 733 * we've reached the end of the queue. A single iteration of this loop 734 * may cause more than one page to be laundered because of clustering. 735 * 736 * maxscan ensures that we don't re-examine requeued pages. Any 737 * additional pages written as part of a cluster are subtracted from 738 * maxscan since they must be taken from the laundry queue. 739 * 740 * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no 741 * swap devices are configured. 742 */ 743 if (atomic_load_acq_int(&swapdev_enabled)) 744 pq = &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]; 745 else 746 pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; 747 748 scan: 749 vm_pagequeue_lock(pq); 750 maxscan = pq->pq_cnt; 751 queue_locked = true; 752 for (m = TAILQ_FIRST(&pq->pq_pl); 753 m != NULL && maxscan-- > 0 && launder > 0; 754 m = next) { 755 vm_pagequeue_assert_locked(pq); 756 KASSERT(queue_locked, ("unlocked laundry queue")); 757 KASSERT(vm_page_in_laundry(m), 758 ("page %p has an inconsistent queue", m)); 759 next = TAILQ_NEXT(m, plinks.q); 760 if ((m->flags & PG_MARKER) != 0) 761 continue; 762 KASSERT((m->flags & PG_FICTITIOUS) == 0, 763 ("PG_FICTITIOUS page %p cannot be in laundry queue", m)); 764 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 765 ("VPO_UNMANAGED page %p cannot be in laundry queue", m)); 766 if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { 767 vm_page_unlock(m); 768 continue; 769 } 770 object = m->object; 771 if ((!VM_OBJECT_TRYWLOCK(object) && 772 (!vm_pageout_fallback_object_lock(m, &next) || 773 m->hold_count != 0)) || vm_page_busied(m)) { 774 VM_OBJECT_WUNLOCK(object); 775 vm_page_unlock(m); 776 continue; 777 } 778 779 /* 780 * Unlock the laundry queue, invalidating the 'next' pointer. 781 * Use a marker to remember our place in the laundry queue. 782 */ 783 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, 784 plinks.q); 785 vm_pagequeue_unlock(pq); 786 queue_locked = false; 787 788 /* 789 * Invalid pages can be easily freed. They cannot be 790 * mapped; vm_page_free() asserts this. 791 */ 792 if (m->valid == 0) 793 goto free_page; 794 795 /* 796 * If the page has been referenced and the object is not dead, 797 * reactivate or requeue the page depending on whether the 798 * object is mapped. 799 */ 800 if ((m->aflags & PGA_REFERENCED) != 0) { 801 vm_page_aflag_clear(m, PGA_REFERENCED); 802 act_delta = 1; 803 } else 804 act_delta = 0; 805 if (object->ref_count != 0) 806 act_delta += pmap_ts_referenced(m); 807 else { 808 KASSERT(!pmap_page_is_mapped(m), 809 ("page %p is mapped", m)); 810 } 811 if (act_delta != 0) { 812 if (object->ref_count != 0) { 813 VM_CNT_INC(v_reactivated); 814 vm_page_activate(m); 815 816 /* 817 * Increase the activation count if the page 818 * was referenced while in the laundry queue. 819 * This makes it less likely that the page will 820 * be returned prematurely to the inactive 821 * queue. 822 */ 823 m->act_count += act_delta + ACT_ADVANCE; 824 825 /* 826 * If this was a background laundering, count 827 * activated pages towards our target. The 828 * purpose of background laundering is to ensure 829 * that pages are eventually cycled through the 830 * laundry queue, and an activation is a valid 831 * way out. 832 */ 833 if (!in_shortfall) 834 launder--; 835 goto drop_page; 836 } else if ((object->flags & OBJ_DEAD) == 0) 837 goto requeue_page; 838 } 839 840 /* 841 * If the page appears to be clean at the machine-independent 842 * layer, then remove all of its mappings from the pmap in 843 * anticipation of freeing it. If, however, any of the page's 844 * mappings allow write access, then the page may still be 845 * modified until the last of those mappings are removed. 846 */ 847 if (object->ref_count != 0) { 848 vm_page_test_dirty(m); 849 if (m->dirty == 0) 850 pmap_remove_all(m); 851 } 852 853 /* 854 * Clean pages are freed, and dirty pages are paged out unless 855 * they belong to a dead object. Requeueing dirty pages from 856 * dead objects is pointless, as they are being paged out and 857 * freed by the thread that destroyed the object. 858 */ 859 if (m->dirty == 0) { 860 free_page: 861 vm_page_free(m); 862 VM_CNT_INC(v_dfree); 863 } else if ((object->flags & OBJ_DEAD) == 0) { 864 if (object->type != OBJT_SWAP && 865 object->type != OBJT_DEFAULT) 866 pageout_ok = true; 867 else if (disable_swap_pageouts) 868 pageout_ok = false; 869 else 870 pageout_ok = true; 871 if (!pageout_ok) { 872 requeue_page: 873 vm_pagequeue_lock(pq); 874 queue_locked = true; 875 vm_page_requeue_locked(m); 876 goto drop_page; 877 } 878 879 /* 880 * Form a cluster with adjacent, dirty pages from the 881 * same object, and page out that entire cluster. 882 * 883 * The adjacent, dirty pages must also be in the 884 * laundry. However, their mappings are not checked 885 * for new references. Consequently, a recently 886 * referenced page may be paged out. However, that 887 * page will not be prematurely reclaimed. After page 888 * out, the page will be placed in the inactive queue, 889 * where any new references will be detected and the 890 * page reactivated. 891 */ 892 error = vm_pageout_clean(m, &numpagedout); 893 if (error == 0) { 894 launder -= numpagedout; 895 maxscan -= numpagedout - 1; 896 } else if (error == EDEADLK) { 897 pageout_lock_miss++; 898 vnodes_skipped++; 899 } 900 goto relock_queue; 901 } 902 drop_page: 903 vm_page_unlock(m); 904 VM_OBJECT_WUNLOCK(object); 905 relock_queue: 906 if (!queue_locked) { 907 vm_pagequeue_lock(pq); 908 queue_locked = true; 909 } 910 next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); 911 TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); 912 } 913 vm_pagequeue_unlock(pq); 914 915 if (launder > 0 && pq == &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]) { 916 pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; 917 goto scan; 918 } 919 920 /* 921 * Wakeup the sync daemon if we skipped a vnode in a writeable object 922 * and we didn't launder enough pages. 923 */ 924 if (vnodes_skipped > 0 && launder > 0) 925 (void)speedup_syncer(); 926 927 return (starting_target - launder); 928 } 929 930 /* 931 * Compute the integer square root. 932 */ 933 static u_int 934 isqrt(u_int num) 935 { 936 u_int bit, root, tmp; 937 938 bit = 1u << ((NBBY * sizeof(u_int)) - 2); 939 while (bit > num) 940 bit >>= 2; 941 root = 0; 942 while (bit != 0) { 943 tmp = root + bit; 944 root >>= 1; 945 if (num >= tmp) { 946 num -= tmp; 947 root += bit; 948 } 949 bit >>= 2; 950 } 951 return (root); 952 } 953 954 /* 955 * Perform the work of the laundry thread: periodically wake up and determine 956 * whether any pages need to be laundered. If so, determine the number of pages 957 * that need to be laundered, and launder them. 958 */ 959 static void 960 vm_pageout_laundry_worker(void *arg) 961 { 962 struct vm_domain *domain; 963 struct vm_pagequeue *pq; 964 uint64_t nclean, ndirty; 965 u_int inactq_scans, last_launder; 966 int domidx, last_target, launder, shortfall, shortfall_cycle, target; 967 bool in_shortfall; 968 969 domidx = (uintptr_t)arg; 970 domain = &vm_dom[domidx]; 971 pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; 972 KASSERT(domain->vmd_segs != 0, ("domain without segments")); 973 vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); 974 975 shortfall = 0; 976 in_shortfall = false; 977 shortfall_cycle = 0; 978 target = 0; 979 inactq_scans = 0; 980 last_launder = 0; 981 982 /* 983 * Calls to these handlers are serialized by the swap syscall lock. 984 */ 985 (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, domain, 986 EVENTHANDLER_PRI_ANY); 987 (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, domain, 988 EVENTHANDLER_PRI_ANY); 989 990 /* 991 * The pageout laundry worker is never done, so loop forever. 992 */ 993 for (;;) { 994 KASSERT(target >= 0, ("negative target %d", target)); 995 KASSERT(shortfall_cycle >= 0, 996 ("negative cycle %d", shortfall_cycle)); 997 launder = 0; 998 999 /* 1000 * First determine whether we need to launder pages to meet a 1001 * shortage of free pages. 1002 */ 1003 if (shortfall > 0) { 1004 in_shortfall = true; 1005 shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; 1006 target = shortfall; 1007 } else if (!in_shortfall) 1008 goto trybackground; 1009 else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { 1010 /* 1011 * We recently entered shortfall and began laundering 1012 * pages. If we have completed that laundering run 1013 * (and we are no longer in shortfall) or we have met 1014 * our laundry target through other activity, then we 1015 * can stop laundering pages. 1016 */ 1017 in_shortfall = false; 1018 target = 0; 1019 goto trybackground; 1020 } 1021 last_launder = inactq_scans; 1022 launder = target / shortfall_cycle--; 1023 goto dolaundry; 1024 1025 /* 1026 * There's no immediate need to launder any pages; see if we 1027 * meet the conditions to perform background laundering: 1028 * 1029 * 1. The ratio of dirty to clean inactive pages exceeds the 1030 * background laundering threshold and the pagedaemon has 1031 * been woken up to reclaim pages since our last 1032 * laundering, or 1033 * 2. we haven't yet reached the target of the current 1034 * background laundering run. 1035 * 1036 * The background laundering threshold is not a constant. 1037 * Instead, it is a slowly growing function of the number of 1038 * page daemon scans since the last laundering. Thus, as the 1039 * ratio of dirty to clean inactive pages grows, the amount of 1040 * memory pressure required to trigger laundering decreases. 1041 */ 1042 trybackground: 1043 nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; 1044 ndirty = vm_cnt.v_laundry_count; 1045 if (target == 0 && inactq_scans != last_launder && 1046 ndirty * isqrt(inactq_scans - last_launder) >= nclean) { 1047 target = vm_background_launder_target; 1048 } 1049 1050 /* 1051 * We have a non-zero background laundering target. If we've 1052 * laundered up to our maximum without observing a page daemon 1053 * request, just stop. This is a safety belt that ensures we 1054 * don't launder an excessive amount if memory pressure is low 1055 * and the ratio of dirty to clean pages is large. Otherwise, 1056 * proceed at the background laundering rate. 1057 */ 1058 if (target > 0) { 1059 if (inactq_scans != last_launder) { 1060 last_launder = inactq_scans; 1061 last_target = target; 1062 } else if (last_target - target >= 1063 vm_background_launder_max * PAGE_SIZE / 1024) { 1064 target = 0; 1065 } 1066 launder = vm_background_launder_rate * PAGE_SIZE / 1024; 1067 launder /= VM_LAUNDER_RATE; 1068 if (launder > target) 1069 launder = target; 1070 } 1071 1072 dolaundry: 1073 if (launder > 0) { 1074 /* 1075 * Because of I/O clustering, the number of laundered 1076 * pages could exceed "target" by the maximum size of 1077 * a cluster minus one. 1078 */ 1079 target -= min(vm_pageout_launder(domain, launder, 1080 in_shortfall), target); 1081 pause("laundp", hz / VM_LAUNDER_RATE); 1082 } 1083 1084 /* 1085 * If we're not currently laundering pages and the page daemon 1086 * hasn't posted a new request, sleep until the page daemon 1087 * kicks us. 1088 */ 1089 vm_pagequeue_lock(pq); 1090 if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) 1091 (void)mtx_sleep(&vm_laundry_request, 1092 vm_pagequeue_lockptr(pq), PVM, "launds", 0); 1093 1094 /* 1095 * If the pagedaemon has indicated that it's in shortfall, start 1096 * a shortfall laundering unless we're already in the middle of 1097 * one. This may preempt a background laundering. 1098 */ 1099 if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && 1100 (!in_shortfall || shortfall_cycle == 0)) { 1101 shortfall = vm_laundry_target() + vm_pageout_deficit; 1102 target = 0; 1103 } else 1104 shortfall = 0; 1105 1106 if (target == 0) 1107 vm_laundry_request = VM_LAUNDRY_IDLE; 1108 inactq_scans = vm_inactq_scans; 1109 vm_pagequeue_unlock(pq); 1110 } 1111 } 1112 1113 /* 1114 * vm_pageout_scan does the dirty work for the pageout daemon. 1115 * 1116 * pass == 0: Update active LRU/deactivate pages 1117 * pass >= 1: Free inactive pages 1118 * 1119 * Returns true if pass was zero or enough pages were freed by the inactive 1120 * queue scan to meet the target. 1121 */ 1122 static bool 1123 vm_pageout_scan(struct vm_domain *vmd, int pass) 1124 { 1125 vm_page_t m, next; 1126 struct vm_pagequeue *pq; 1127 vm_object_t object; 1128 long min_scan; 1129 int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; 1130 int page_shortage, scan_tick, scanned, starting_page_shortage; 1131 boolean_t queue_locked; 1132 1133 /* 1134 * If we need to reclaim memory ask kernel caches to return 1135 * some. We rate limit to avoid thrashing. 1136 */ 1137 if (vmd == &vm_dom[0] && pass > 0 && 1138 (time_uptime - lowmem_uptime) >= lowmem_period) { 1139 /* 1140 * Decrease registered cache sizes. 1141 */ 1142 SDT_PROBE0(vm, , , vm__lowmem_scan); 1143 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES); 1144 /* 1145 * We do this explicitly after the caches have been 1146 * drained above. 1147 */ 1148 uma_reclaim(); 1149 lowmem_uptime = time_uptime; 1150 } 1151 1152 /* 1153 * The addl_page_shortage is the number of temporarily 1154 * stuck pages in the inactive queue. In other words, the 1155 * number of pages from the inactive count that should be 1156 * discounted in setting the target for the active queue scan. 1157 */ 1158 addl_page_shortage = 0; 1159 1160 /* 1161 * Calculate the number of pages that we want to free. This number 1162 * can be negative if many pages are freed between the wakeup call to 1163 * the page daemon and this calculation. 1164 */ 1165 if (pass > 0) { 1166 deficit = atomic_readandclear_int(&vm_pageout_deficit); 1167 page_shortage = vm_paging_target() + deficit; 1168 } else 1169 page_shortage = deficit = 0; 1170 starting_page_shortage = page_shortage; 1171 1172 /* 1173 * Start scanning the inactive queue for pages that we can free. The 1174 * scan will stop when we reach the target or we have scanned the 1175 * entire queue. (Note that m->act_count is not used to make 1176 * decisions for the inactive queue, only for the active queue.) 1177 */ 1178 pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; 1179 maxscan = pq->pq_cnt; 1180 vm_pagequeue_lock(pq); 1181 queue_locked = TRUE; 1182 for (m = TAILQ_FIRST(&pq->pq_pl); 1183 m != NULL && maxscan-- > 0 && page_shortage > 0; 1184 m = next) { 1185 vm_pagequeue_assert_locked(pq); 1186 KASSERT(queue_locked, ("unlocked inactive queue")); 1187 KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); 1188 1189 VM_CNT_INC(v_pdpages); 1190 next = TAILQ_NEXT(m, plinks.q); 1191 1192 /* 1193 * skip marker pages 1194 */ 1195 if (m->flags & PG_MARKER) 1196 continue; 1197 1198 KASSERT((m->flags & PG_FICTITIOUS) == 0, 1199 ("Fictitious page %p cannot be in inactive queue", m)); 1200 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1201 ("Unmanaged page %p cannot be in inactive queue", m)); 1202 1203 /* 1204 * The page or object lock acquisitions fail if the 1205 * page was removed from the queue or moved to a 1206 * different position within the queue. In either 1207 * case, addl_page_shortage should not be incremented. 1208 */ 1209 if (!vm_pageout_page_lock(m, &next)) 1210 goto unlock_page; 1211 else if (m->hold_count != 0) { 1212 /* 1213 * Held pages are essentially stuck in the 1214 * queue. So, they ought to be discounted 1215 * from the inactive count. See the 1216 * calculation of inactq_shortage before the 1217 * loop over the active queue below. 1218 */ 1219 addl_page_shortage++; 1220 goto unlock_page; 1221 } 1222 object = m->object; 1223 if (!VM_OBJECT_TRYWLOCK(object)) { 1224 if (!vm_pageout_fallback_object_lock(m, &next)) 1225 goto unlock_object; 1226 else if (m->hold_count != 0) { 1227 addl_page_shortage++; 1228 goto unlock_object; 1229 } 1230 } 1231 if (vm_page_busied(m)) { 1232 /* 1233 * Don't mess with busy pages. Leave them at 1234 * the front of the queue. Most likely, they 1235 * are being paged out and will leave the 1236 * queue shortly after the scan finishes. So, 1237 * they ought to be discounted from the 1238 * inactive count. 1239 */ 1240 addl_page_shortage++; 1241 unlock_object: 1242 VM_OBJECT_WUNLOCK(object); 1243 unlock_page: 1244 vm_page_unlock(m); 1245 continue; 1246 } 1247 KASSERT(m->hold_count == 0, ("Held page %p", m)); 1248 1249 /* 1250 * Dequeue the inactive page and unlock the inactive page 1251 * queue, invalidating the 'next' pointer. Dequeueing the 1252 * page here avoids a later reacquisition (and release) of 1253 * the inactive page queue lock when vm_page_activate(), 1254 * vm_page_free(), or vm_page_launder() is called. Use a 1255 * marker to remember our place in the inactive queue. 1256 */ 1257 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); 1258 vm_page_dequeue_locked(m); 1259 vm_pagequeue_unlock(pq); 1260 queue_locked = FALSE; 1261 1262 /* 1263 * Invalid pages can be easily freed. They cannot be 1264 * mapped, vm_page_free() asserts this. 1265 */ 1266 if (m->valid == 0) 1267 goto free_page; 1268 1269 /* 1270 * If the page has been referenced and the object is not dead, 1271 * reactivate or requeue the page depending on whether the 1272 * object is mapped. 1273 */ 1274 if ((m->aflags & PGA_REFERENCED) != 0) { 1275 vm_page_aflag_clear(m, PGA_REFERENCED); 1276 act_delta = 1; 1277 } else 1278 act_delta = 0; 1279 if (object->ref_count != 0) { 1280 act_delta += pmap_ts_referenced(m); 1281 } else { 1282 KASSERT(!pmap_page_is_mapped(m), 1283 ("vm_pageout_scan: page %p is mapped", m)); 1284 } 1285 if (act_delta != 0) { 1286 if (object->ref_count != 0) { 1287 VM_CNT_INC(v_reactivated); 1288 vm_page_activate(m); 1289 1290 /* 1291 * Increase the activation count if the page 1292 * was referenced while in the inactive queue. 1293 * This makes it less likely that the page will 1294 * be returned prematurely to the inactive 1295 * queue. 1296 */ 1297 m->act_count += act_delta + ACT_ADVANCE; 1298 goto drop_page; 1299 } else if ((object->flags & OBJ_DEAD) == 0) { 1300 vm_pagequeue_lock(pq); 1301 queue_locked = TRUE; 1302 m->queue = PQ_INACTIVE; 1303 TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); 1304 vm_pagequeue_cnt_inc(pq); 1305 goto drop_page; 1306 } 1307 } 1308 1309 /* 1310 * If the page appears to be clean at the machine-independent 1311 * layer, then remove all of its mappings from the pmap in 1312 * anticipation of freeing it. If, however, any of the page's 1313 * mappings allow write access, then the page may still be 1314 * modified until the last of those mappings are removed. 1315 */ 1316 if (object->ref_count != 0) { 1317 vm_page_test_dirty(m); 1318 if (m->dirty == 0) 1319 pmap_remove_all(m); 1320 } 1321 1322 /* 1323 * Clean pages can be freed, but dirty pages must be sent back 1324 * to the laundry, unless they belong to a dead object. 1325 * Requeueing dirty pages from dead objects is pointless, as 1326 * they are being paged out and freed by the thread that 1327 * destroyed the object. 1328 */ 1329 if (m->dirty == 0) { 1330 free_page: 1331 vm_page_free(m); 1332 VM_CNT_INC(v_dfree); 1333 --page_shortage; 1334 } else if ((object->flags & OBJ_DEAD) == 0) 1335 vm_page_launder(m); 1336 drop_page: 1337 vm_page_unlock(m); 1338 VM_OBJECT_WUNLOCK(object); 1339 if (!queue_locked) { 1340 vm_pagequeue_lock(pq); 1341 queue_locked = TRUE; 1342 } 1343 next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); 1344 TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); 1345 } 1346 vm_pagequeue_unlock(pq); 1347 1348 /* 1349 * Wake up the laundry thread so that it can perform any needed 1350 * laundering. If we didn't meet our target, we're in shortfall and 1351 * need to launder more aggressively. If PQ_LAUNDRY is empty and no 1352 * swap devices are configured, the laundry thread has no work to do, so 1353 * don't bother waking it up. 1354 * 1355 * The laundry thread uses the number of inactive queue scans elapsed 1356 * since the last laundering to determine whether to launder again, so 1357 * keep count. 1358 */ 1359 if (starting_page_shortage > 0) { 1360 pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; 1361 vm_pagequeue_lock(pq); 1362 if (vm_laundry_request == VM_LAUNDRY_IDLE && 1363 (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) { 1364 if (page_shortage > 0) { 1365 vm_laundry_request = VM_LAUNDRY_SHORTFALL; 1366 VM_CNT_INC(v_pdshortfalls); 1367 } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) 1368 vm_laundry_request = VM_LAUNDRY_BACKGROUND; 1369 wakeup(&vm_laundry_request); 1370 } 1371 vm_inactq_scans++; 1372 vm_pagequeue_unlock(pq); 1373 } 1374 1375 /* 1376 * Wakeup the swapout daemon if we didn't free the targeted number of 1377 * pages. 1378 */ 1379 if (page_shortage > 0) 1380 vm_swapout_run(); 1381 1382 /* 1383 * If the inactive queue scan fails repeatedly to meet its 1384 * target, kill the largest process. 1385 */ 1386 vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); 1387 1388 /* 1389 * Compute the number of pages we want to try to move from the 1390 * active queue to either the inactive or laundry queue. 1391 * 1392 * When scanning active pages, we make clean pages count more heavily 1393 * towards the page shortage than dirty pages. This is because dirty 1394 * pages must be laundered before they can be reused and thus have less 1395 * utility when attempting to quickly alleviate a shortage. However, 1396 * this weighting also causes the scan to deactivate dirty pages more 1397 * more aggressively, improving the effectiveness of clustering and 1398 * ensuring that they can eventually be reused. 1399 */ 1400 inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + 1401 vm_cnt.v_laundry_count / act_scan_laundry_weight) + 1402 vm_paging_target() + deficit + addl_page_shortage; 1403 inactq_shortage *= act_scan_laundry_weight; 1404 1405 pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; 1406 vm_pagequeue_lock(pq); 1407 maxscan = pq->pq_cnt; 1408 1409 /* 1410 * If we're just idle polling attempt to visit every 1411 * active page within 'update_period' seconds. 1412 */ 1413 scan_tick = ticks; 1414 if (vm_pageout_update_period != 0) { 1415 min_scan = pq->pq_cnt; 1416 min_scan *= scan_tick - vmd->vmd_last_active_scan; 1417 min_scan /= hz * vm_pageout_update_period; 1418 } else 1419 min_scan = 0; 1420 if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0)) 1421 vmd->vmd_last_active_scan = scan_tick; 1422 1423 /* 1424 * Scan the active queue for pages that can be deactivated. Update 1425 * the per-page activity counter and use it to identify deactivation 1426 * candidates. Held pages may be deactivated. 1427 */ 1428 for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < 1429 min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next, 1430 scanned++) { 1431 KASSERT(m->queue == PQ_ACTIVE, 1432 ("vm_pageout_scan: page %p isn't active", m)); 1433 next = TAILQ_NEXT(m, plinks.q); 1434 if ((m->flags & PG_MARKER) != 0) 1435 continue; 1436 KASSERT((m->flags & PG_FICTITIOUS) == 0, 1437 ("Fictitious page %p cannot be in active queue", m)); 1438 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1439 ("Unmanaged page %p cannot be in active queue", m)); 1440 if (!vm_pageout_page_lock(m, &next)) { 1441 vm_page_unlock(m); 1442 continue; 1443 } 1444 1445 /* 1446 * The count for page daemon pages is updated after checking 1447 * the page for eligibility. 1448 */ 1449 VM_CNT_INC(v_pdpages); 1450 1451 /* 1452 * Check to see "how much" the page has been used. 1453 */ 1454 if ((m->aflags & PGA_REFERENCED) != 0) { 1455 vm_page_aflag_clear(m, PGA_REFERENCED); 1456 act_delta = 1; 1457 } else 1458 act_delta = 0; 1459 1460 /* 1461 * Perform an unsynchronized object ref count check. While 1462 * the page lock ensures that the page is not reallocated to 1463 * another object, in particular, one with unmanaged mappings 1464 * that cannot support pmap_ts_referenced(), two races are, 1465 * nonetheless, possible: 1466 * 1) The count was transitioning to zero, but we saw a non- 1467 * zero value. pmap_ts_referenced() will return zero 1468 * because the page is not mapped. 1469 * 2) The count was transitioning to one, but we saw zero. 1470 * This race delays the detection of a new reference. At 1471 * worst, we will deactivate and reactivate the page. 1472 */ 1473 if (m->object->ref_count != 0) 1474 act_delta += pmap_ts_referenced(m); 1475 1476 /* 1477 * Advance or decay the act_count based on recent usage. 1478 */ 1479 if (act_delta != 0) { 1480 m->act_count += ACT_ADVANCE + act_delta; 1481 if (m->act_count > ACT_MAX) 1482 m->act_count = ACT_MAX; 1483 } else 1484 m->act_count -= min(m->act_count, ACT_DECLINE); 1485 1486 /* 1487 * Move this page to the tail of the active, inactive or laundry 1488 * queue depending on usage. 1489 */ 1490 if (m->act_count == 0) { 1491 /* Dequeue to avoid later lock recursion. */ 1492 vm_page_dequeue_locked(m); 1493 1494 /* 1495 * When not short for inactive pages, let dirty pages go 1496 * through the inactive queue before moving to the 1497 * laundry queues. This gives them some extra time to 1498 * be reactivated, potentially avoiding an expensive 1499 * pageout. During a page shortage, the inactive queue 1500 * is necessarily small, so we may move dirty pages 1501 * directly to the laundry queue. 1502 */ 1503 if (inactq_shortage <= 0) 1504 vm_page_deactivate(m); 1505 else { 1506 /* 1507 * Calling vm_page_test_dirty() here would 1508 * require acquisition of the object's write 1509 * lock. However, during a page shortage, 1510 * directing dirty pages into the laundry 1511 * queue is only an optimization and not a 1512 * requirement. Therefore, we simply rely on 1513 * the opportunistic updates to the page's 1514 * dirty field by the pmap. 1515 */ 1516 if (m->dirty == 0) { 1517 vm_page_deactivate(m); 1518 inactq_shortage -= 1519 act_scan_laundry_weight; 1520 } else { 1521 vm_page_launder(m); 1522 inactq_shortage--; 1523 } 1524 } 1525 } else 1526 vm_page_requeue_locked(m); 1527 vm_page_unlock(m); 1528 } 1529 vm_pagequeue_unlock(pq); 1530 if (pass > 0) 1531 vm_swapout_run_idle(); 1532 return (page_shortage <= 0); 1533 } 1534 1535 static int vm_pageout_oom_vote; 1536 1537 /* 1538 * The pagedaemon threads randlomly select one to perform the 1539 * OOM. Trying to kill processes before all pagedaemons 1540 * failed to reach free target is premature. 1541 */ 1542 static void 1543 vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, 1544 int starting_page_shortage) 1545 { 1546 int old_vote; 1547 1548 if (starting_page_shortage <= 0 || starting_page_shortage != 1549 page_shortage) 1550 vmd->vmd_oom_seq = 0; 1551 else 1552 vmd->vmd_oom_seq++; 1553 if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { 1554 if (vmd->vmd_oom) { 1555 vmd->vmd_oom = FALSE; 1556 atomic_subtract_int(&vm_pageout_oom_vote, 1); 1557 } 1558 return; 1559 } 1560 1561 /* 1562 * Do not follow the call sequence until OOM condition is 1563 * cleared. 1564 */ 1565 vmd->vmd_oom_seq = 0; 1566 1567 if (vmd->vmd_oom) 1568 return; 1569 1570 vmd->vmd_oom = TRUE; 1571 old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); 1572 if (old_vote != vm_ndomains - 1) 1573 return; 1574 1575 /* 1576 * The current pagedaemon thread is the last in the quorum to 1577 * start OOM. Initiate the selection and signaling of the 1578 * victim. 1579 */ 1580 vm_pageout_oom(VM_OOM_MEM); 1581 1582 /* 1583 * After one round of OOM terror, recall our vote. On the 1584 * next pass, current pagedaemon would vote again if the low 1585 * memory condition is still there, due to vmd_oom being 1586 * false. 1587 */ 1588 vmd->vmd_oom = FALSE; 1589 atomic_subtract_int(&vm_pageout_oom_vote, 1); 1590 } 1591 1592 /* 1593 * The OOM killer is the page daemon's action of last resort when 1594 * memory allocation requests have been stalled for a prolonged period 1595 * of time because it cannot reclaim memory. This function computes 1596 * the approximate number of physical pages that could be reclaimed if 1597 * the specified address space is destroyed. 1598 * 1599 * Private, anonymous memory owned by the address space is the 1600 * principal resource that we expect to recover after an OOM kill. 1601 * Since the physical pages mapped by the address space's COW entries 1602 * are typically shared pages, they are unlikely to be released and so 1603 * they are not counted. 1604 * 1605 * To get to the point where the page daemon runs the OOM killer, its 1606 * efforts to write-back vnode-backed pages may have stalled. This 1607 * could be caused by a memory allocation deadlock in the write path 1608 * that might be resolved by an OOM kill. Therefore, physical pages 1609 * belonging to vnode-backed objects are counted, because they might 1610 * be freed without being written out first if the address space holds 1611 * the last reference to an unlinked vnode. 1612 * 1613 * Similarly, physical pages belonging to OBJT_PHYS objects are 1614 * counted because the address space might hold the last reference to 1615 * the object. 1616 */ 1617 static long 1618 vm_pageout_oom_pagecount(struct vmspace *vmspace) 1619 { 1620 vm_map_t map; 1621 vm_map_entry_t entry; 1622 vm_object_t obj; 1623 long res; 1624 1625 map = &vmspace->vm_map; 1626 KASSERT(!map->system_map, ("system map")); 1627 sx_assert(&map->lock, SA_LOCKED); 1628 res = 0; 1629 for (entry = map->header.next; entry != &map->header; 1630 entry = entry->next) { 1631 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) 1632 continue; 1633 obj = entry->object.vm_object; 1634 if (obj == NULL) 1635 continue; 1636 if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && 1637 obj->ref_count != 1) 1638 continue; 1639 switch (obj->type) { 1640 case OBJT_DEFAULT: 1641 case OBJT_SWAP: 1642 case OBJT_PHYS: 1643 case OBJT_VNODE: 1644 res += obj->resident_page_count; 1645 break; 1646 } 1647 } 1648 return (res); 1649 } 1650 1651 void 1652 vm_pageout_oom(int shortage) 1653 { 1654 struct proc *p, *bigproc; 1655 vm_offset_t size, bigsize; 1656 struct thread *td; 1657 struct vmspace *vm; 1658 bool breakout; 1659 1660 /* 1661 * We keep the process bigproc locked once we find it to keep anyone 1662 * from messing with it; however, there is a possibility of 1663 * deadlock if process B is bigproc and one of its child processes 1664 * attempts to propagate a signal to B while we are waiting for A's 1665 * lock while walking this list. To avoid this, we don't block on 1666 * the process lock but just skip a process if it is already locked. 1667 */ 1668 bigproc = NULL; 1669 bigsize = 0; 1670 sx_slock(&allproc_lock); 1671 FOREACH_PROC_IN_SYSTEM(p) { 1672 PROC_LOCK(p); 1673 1674 /* 1675 * If this is a system, protected or killed process, skip it. 1676 */ 1677 if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | 1678 P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || 1679 p->p_pid == 1 || P_KILLED(p) || 1680 (p->p_pid < 48 && swap_pager_avail != 0)) { 1681 PROC_UNLOCK(p); 1682 continue; 1683 } 1684 /* 1685 * If the process is in a non-running type state, 1686 * don't touch it. Check all the threads individually. 1687 */ 1688 breakout = false; 1689 FOREACH_THREAD_IN_PROC(p, td) { 1690 thread_lock(td); 1691 if (!TD_ON_RUNQ(td) && 1692 !TD_IS_RUNNING(td) && 1693 !TD_IS_SLEEPING(td) && 1694 !TD_IS_SUSPENDED(td) && 1695 !TD_IS_SWAPPED(td)) { 1696 thread_unlock(td); 1697 breakout = true; 1698 break; 1699 } 1700 thread_unlock(td); 1701 } 1702 if (breakout) { 1703 PROC_UNLOCK(p); 1704 continue; 1705 } 1706 /* 1707 * get the process size 1708 */ 1709 vm = vmspace_acquire_ref(p); 1710 if (vm == NULL) { 1711 PROC_UNLOCK(p); 1712 continue; 1713 } 1714 _PHOLD_LITE(p); 1715 PROC_UNLOCK(p); 1716 sx_sunlock(&allproc_lock); 1717 if (!vm_map_trylock_read(&vm->vm_map)) { 1718 vmspace_free(vm); 1719 sx_slock(&allproc_lock); 1720 PRELE(p); 1721 continue; 1722 } 1723 size = vmspace_swap_count(vm); 1724 if (shortage == VM_OOM_MEM) 1725 size += vm_pageout_oom_pagecount(vm); 1726 vm_map_unlock_read(&vm->vm_map); 1727 vmspace_free(vm); 1728 sx_slock(&allproc_lock); 1729 1730 /* 1731 * If this process is bigger than the biggest one, 1732 * remember it. 1733 */ 1734 if (size > bigsize) { 1735 if (bigproc != NULL) 1736 PRELE(bigproc); 1737 bigproc = p; 1738 bigsize = size; 1739 } else { 1740 PRELE(p); 1741 } 1742 } 1743 sx_sunlock(&allproc_lock); 1744 if (bigproc != NULL) { 1745 if (vm_panic_on_oom != 0) 1746 panic("out of swap space"); 1747 PROC_LOCK(bigproc); 1748 killproc(bigproc, "out of swap space"); 1749 sched_nice(bigproc, PRIO_MIN); 1750 _PRELE(bigproc); 1751 PROC_UNLOCK(bigproc); 1752 wakeup(&vm_cnt.v_free_count); 1753 } 1754 } 1755 1756 static void 1757 vm_pageout_worker(void *arg) 1758 { 1759 struct vm_domain *domain; 1760 int domidx, pass; 1761 bool target_met; 1762 1763 domidx = (uintptr_t)arg; 1764 domain = &vm_dom[domidx]; 1765 pass = 0; 1766 target_met = true; 1767 1768 /* 1769 * XXXKIB It could be useful to bind pageout daemon threads to 1770 * the cores belonging to the domain, from which vm_page_array 1771 * is allocated. 1772 */ 1773 1774 KASSERT(domain->vmd_segs != 0, ("domain without segments")); 1775 domain->vmd_last_active_scan = ticks; 1776 vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); 1777 vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE); 1778 TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl, 1779 &domain->vmd_inacthead, plinks.q); 1780 1781 /* 1782 * The pageout daemon worker is never done, so loop forever. 1783 */ 1784 while (TRUE) { 1785 mtx_lock(&vm_page_queue_free_mtx); 1786 1787 /* 1788 * Generally, after a level >= 1 scan, if there are enough 1789 * free pages to wakeup the waiters, then they are already 1790 * awake. A call to vm_page_free() during the scan awakened 1791 * them. However, in the following case, this wakeup serves 1792 * to bound the amount of time that a thread might wait. 1793 * Suppose a thread's call to vm_page_alloc() fails, but 1794 * before that thread calls VM_WAIT, enough pages are freed by 1795 * other threads to alleviate the free page shortage. The 1796 * thread will, nonetheless, wait until another page is freed 1797 * or this wakeup is performed. 1798 */ 1799 if (vm_pages_needed && !vm_page_count_min()) { 1800 vm_pages_needed = false; 1801 wakeup(&vm_cnt.v_free_count); 1802 } 1803 1804 /* 1805 * Do not clear vm_pageout_wanted until we reach our free page 1806 * target. Otherwise, we may be awakened over and over again, 1807 * wasting CPU time. 1808 */ 1809 if (vm_pageout_wanted && target_met) 1810 vm_pageout_wanted = false; 1811 1812 /* 1813 * Might the page daemon receive a wakeup call? 1814 */ 1815 if (vm_pageout_wanted) { 1816 /* 1817 * No. Either vm_pageout_wanted was set by another 1818 * thread during the previous scan, which must have 1819 * been a level 0 scan, or vm_pageout_wanted was 1820 * already set and the scan failed to free enough 1821 * pages. If we haven't yet performed a level >= 1 1822 * (page reclamation) scan, then increase the level 1823 * and scan again now. Otherwise, sleep a bit and 1824 * try again later. 1825 */ 1826 mtx_unlock(&vm_page_queue_free_mtx); 1827 if (pass >= 1) 1828 pause("pwait", hz / VM_INACT_SCAN_RATE); 1829 pass++; 1830 } else { 1831 /* 1832 * Yes. If threads are still sleeping in VM_WAIT 1833 * then we immediately start a new scan. Otherwise, 1834 * sleep until the next wakeup or until pages need to 1835 * have their reference stats updated. 1836 */ 1837 if (vm_pages_needed) { 1838 mtx_unlock(&vm_page_queue_free_mtx); 1839 if (pass == 0) 1840 pass++; 1841 } else if (mtx_sleep(&vm_pageout_wanted, 1842 &vm_page_queue_free_mtx, PDROP | PVM, "psleep", 1843 hz) == 0) { 1844 VM_CNT_INC(v_pdwakeups); 1845 pass = 1; 1846 } else 1847 pass = 0; 1848 } 1849 1850 target_met = vm_pageout_scan(domain, pass); 1851 } 1852 } 1853 1854 /* 1855 * vm_pageout_init initialises basic pageout daemon settings. 1856 */ 1857 static void 1858 vm_pageout_init(void) 1859 { 1860 /* 1861 * Initialize some paging parameters. 1862 */ 1863 vm_cnt.v_interrupt_free_min = 2; 1864 if (vm_cnt.v_page_count < 2000) 1865 vm_pageout_page_count = 8; 1866 1867 /* 1868 * v_free_reserved needs to include enough for the largest 1869 * swap pager structures plus enough for any pv_entry structs 1870 * when paging. 1871 */ 1872 if (vm_cnt.v_page_count > 1024) 1873 vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200; 1874 else 1875 vm_cnt.v_free_min = 4; 1876 vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + 1877 vm_cnt.v_interrupt_free_min; 1878 vm_cnt.v_free_reserved = vm_pageout_page_count + 1879 vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768); 1880 vm_cnt.v_free_severe = vm_cnt.v_free_min / 2; 1881 vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved; 1882 vm_cnt.v_free_min += vm_cnt.v_free_reserved; 1883 vm_cnt.v_free_severe += vm_cnt.v_free_reserved; 1884 vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2; 1885 if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3) 1886 vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3; 1887 1888 /* 1889 * Set the default wakeup threshold to be 10% above the minimum 1890 * page limit. This keeps the steady state out of shortfall. 1891 */ 1892 vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11; 1893 1894 /* 1895 * Set interval in seconds for active scan. We want to visit each 1896 * page at least once every ten minutes. This is to prevent worst 1897 * case paging behaviors with stale active LRU. 1898 */ 1899 if (vm_pageout_update_period == 0) 1900 vm_pageout_update_period = 600; 1901 1902 /* XXX does not really belong here */ 1903 if (vm_page_max_wired == 0) 1904 vm_page_max_wired = vm_cnt.v_free_count / 3; 1905 1906 /* 1907 * Target amount of memory to move out of the laundry queue during a 1908 * background laundering. This is proportional to the amount of system 1909 * memory. 1910 */ 1911 vm_background_launder_target = (vm_cnt.v_free_target - 1912 vm_cnt.v_free_min) / 10; 1913 } 1914 1915 /* 1916 * vm_pageout is the high level pageout daemon. 1917 */ 1918 static void 1919 vm_pageout(void) 1920 { 1921 int error; 1922 #ifdef VM_NUMA_ALLOC 1923 int i; 1924 #endif 1925 1926 swap_pager_swap_init(); 1927 error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, 1928 0, 0, "laundry: dom0"); 1929 if (error != 0) 1930 panic("starting laundry for domain 0, error %d", error); 1931 #ifdef VM_NUMA_ALLOC 1932 for (i = 1; i < vm_ndomains; i++) { 1933 error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, 1934 curproc, NULL, 0, 0, "dom%d", i); 1935 if (error != 0) { 1936 panic("starting pageout for domain %d, error %d\n", 1937 i, error); 1938 } 1939 } 1940 #endif 1941 error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 1942 0, 0, "uma"); 1943 if (error != 0) 1944 panic("starting uma_reclaim helper, error %d\n", error); 1945 vm_pageout_worker((void *)(uintptr_t)0); 1946 } 1947 1948 /* 1949 * Perform an advisory wakeup of the page daemon. 1950 */ 1951 void 1952 pagedaemon_wakeup(void) 1953 { 1954 1955 mtx_assert(&vm_page_queue_free_mtx, MA_NOTOWNED); 1956 1957 if (!vm_pageout_wanted && curthread->td_proc != pageproc) { 1958 vm_pageout_wanted = true; 1959 wakeup(&vm_pageout_wanted); 1960 } 1961 } 1962 1963 /* 1964 * Wake up the page daemon and wait for it to reclaim free pages. 1965 * 1966 * This function returns with the free queues mutex unlocked. 1967 */ 1968 void 1969 pagedaemon_wait(int pri, const char *wmesg) 1970 { 1971 1972 mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); 1973 1974 /* 1975 * vm_pageout_wanted may have been set by an advisory wakeup, but if the 1976 * page daemon is running on a CPU, the wakeup will have been lost. 1977 * Thus, deliver a potentially spurious wakeup to ensure that the page 1978 * daemon has been notified of the shortage. 1979 */ 1980 if (!vm_pageout_wanted || !vm_pages_needed) { 1981 vm_pageout_wanted = true; 1982 wakeup(&vm_pageout_wanted); 1983 } 1984 vm_pages_needed = true; 1985 msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri, 1986 wmesg, 0); 1987 } 1988