/*-
 * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991 Regents of the University of California.
 *	All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 *	All rights reserved.
 * Copyright (c) 1994 David Greenman
 *	All rights reserved.
 * Copyright (c) 2005 Yahoo! Technologies Norway AS
 *	All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

/* the kernel process "vm_daemon" */
static void vm_daemon(void);
static struct proc *vmproc;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);

static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;

SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
    &vm_swap_enabled, 0,
    "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
    &vm_swap_idle_enabled, 0,
    "Allow swapout on idle criteria");

/*
 * Swap_idle_threshold1 is the guaranteed swapped in time for a process
 */
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
    &swap_idle_threshold1, 0,
    "Guaranteed swapped in time for a process");

/*
 * Swap_idle_threshold2 is the time that a process can be idle before
 * it will be swapped out, if idle swapping is enabled.
 */
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
    &swap_idle_threshold2, 0,
    "Time before a process will be swapped out");

static int vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);

static int swapped_cnt;
static int swap_inprogress;	/* Pending swap-ins done outside swapper. */
static int last_swapin;

static void swapclear(struct proc *);
static int swapout(struct proc *);
static void vm_swapout_map_deactivate_pages(vm_map_t, long);
static void vm_swapout_object_deactivate(pmap_t, vm_object_t, long);
static void swapout_procs(int action);
static void vm_req_vmdaemon(int req);
static void vm_thread_swapout(struct thread *td);

static void
vm_swapout_object_deactivate_page(pmap_t pmap, vm_page_t m, bool unmap)
{

	/*
	 * Ignore unreclaimable wired pages.  Repeat the check after busying
	 * since a busy holder may wire the page.
	 */
	if (vm_page_wired(m) || !vm_page_tryxbusy(m))
		return;

	if (vm_page_wired(m) || !pmap_page_exists_quick(pmap, m)) {
		vm_page_xunbusy(m);
		return;
	}
	if (!pmap_is_referenced(m)) {
		if (!vm_page_active(m))
			(void)vm_page_try_remove_all(m);
		else if (unmap && vm_page_try_remove_all(m))
			vm_page_deactivate(m);
	}
	vm_page_xunbusy(m);
}

/*
 *	vm_swapout_object_deactivate
 *
 *	Deactivate enough pages to satisfy the inactive target
 *	requirements.
 *
 *	The object and map must be locked.
 */
static void
vm_swapout_object_deactivate(pmap_t pmap, vm_object_t first_object,
    long desired)
{
	vm_object_t backing_object, object;
	vm_page_t m;
	bool unmap;

	VM_OBJECT_ASSERT_LOCKED(first_object);
	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
		return;
	for (object = first_object;; object = backing_object) {
		if (pmap_resident_count(pmap) <= desired)
			goto unlock_return;
		VM_OBJECT_ASSERT_LOCKED(object);
		if ((object->flags & OBJ_UNMANAGED) != 0 ||
		    blockcount_read(&object->paging_in_progress) > 0)
			goto unlock_return;

		unmap = true;
		if (object->shadow_count > 1)
			unmap = false;

		/*
		 * Scan the object's entire memory queue.
		 */
		TAILQ_FOREACH(m, &object->memq, listq) {
			if (pmap_resident_count(pmap) <= desired)
				goto unlock_return;
			if (should_yield())
				goto unlock_return;
			vm_swapout_object_deactivate_page(pmap, m, unmap);
		}
		if ((backing_object = object->backing_object) == NULL)
			goto unlock_return;
		VM_OBJECT_RLOCK(backing_object);
		if (object != first_object)
			VM_OBJECT_RUNLOCK(object);
	}
unlock_return:
	if (object != first_object)
		VM_OBJECT_RUNLOCK(object);
}

/*
 * deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
{
	vm_map_entry_t tmpe;
	vm_object_t obj, bigobj;
	int nothingwired;

	if (!vm_map_trylock_read(map))
		return;

	bigobj = NULL;
	nothingwired = TRUE;

	/*
	 * first, search out the biggest object, and try to free pages from
	 * that.
	 */
	VM_MAP_ENTRY_FOREACH(tmpe, map) {
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
				if (obj->shadow_count <= 1 &&
				    (bigobj == NULL ||
				     bigobj->resident_page_count <
				     obj->resident_page_count)) {
					if (bigobj != NULL)
						VM_OBJECT_RUNLOCK(bigobj);
					bigobj = obj;
				} else
					VM_OBJECT_RUNLOCK(obj);
			}
		}
		if (tmpe->wired_count > 0)
			nothingwired = FALSE;
	}

	if (bigobj != NULL) {
		vm_swapout_object_deactivate(map->pmap, bigobj, desired);
		VM_OBJECT_RUNLOCK(bigobj);
	}
	/*
	 * Next, hunt around for other pages to deactivate.  We actually
	 * do this search sort of wrong -- .text first is not the best idea.
	 */
	VM_MAP_ENTRY_FOREACH(tmpe, map) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			break;
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL) {
				VM_OBJECT_RLOCK(obj);
				vm_swapout_object_deactivate(map->pmap, obj,
				    desired);
				VM_OBJECT_RUNLOCK(obj);
			}
		}
	}

	/*
	 * Remove all mappings if a process is swapped out, this will free page
	 * table pages.
	 */
	if (desired == 0 && nothingwired) {
		pmap_remove(vm_map_pmap(map), vm_map_min(map),
		    vm_map_max(map));
	}

	vm_map_unlock_read(map);
}

/*
 * Swap out requests
 */
#define	VM_SWAP_NORMAL	1
#define	VM_SWAP_IDLE	2

void
vm_swapout_run(void)
{

	if (vm_swap_enabled)
		vm_req_vmdaemon(VM_SWAP_NORMAL);
}

/*
 * Idle process swapout -- run once per second when pagedaemons are
 * reclaiming pages.
 */
void
vm_swapout_run_idle(void)
{
	static long lsec;

	if (!vm_swap_idle_enabled || time_second == lsec)
		return;
	vm_req_vmdaemon(VM_SWAP_IDLE);
	lsec = time_second;
}

/*
 * Record a swapout request and wake up the vm daemon, rate-limiting
 * wakeups to roughly once per second.
 */
static void
vm_req_vmdaemon(int req)
{
	static int lastrun = 0;

	mtx_lock(&vm_daemon_mtx);
	vm_pageout_req_swapout |= req;
	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
	mtx_unlock(&vm_daemon_mtx);
}

static void
vm_daemon(void)
{
	struct rlimit rsslim;
	struct proc *p;
	struct thread *td;
	struct vmspace *vm;
	int breakout, swapout_flags, tryagain, attempts;
#ifdef RACCT
	uint64_t rsize, ravailable;
#endif

	while (TRUE) {
		mtx_lock(&vm_daemon_mtx);
		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
#ifdef RACCT
		    racct_enable ? hz : 0
#else
		    0
#endif
		);
		swapout_flags = vm_pageout_req_swapout;
		vm_pageout_req_swapout = 0;
		mtx_unlock(&vm_daemon_mtx);
		if (swapout_flags != 0) {
			/*
			 * Drain the per-CPU page queue batches as a deadlock
			 * avoidance measure.
			 */
			if ((swapout_flags & VM_SWAP_NORMAL) != 0)
				vm_page_pqbatch_drain();
			swapout_procs(swapout_flags);
		}

		/*
		 * Scan the processes for those exceeding their rlimits or
		 * that are swapped out -- deactivate pages.
		 */
		tryagain = 0;
		attempts = 0;
again:
		attempts++;
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			vm_pindex_t limit, size;

			/*
			 * If this is a system process or if we have already
			 * looked at this process, skip it.
			 */
			PROC_LOCK(p);
			if (p->p_state != PRS_NORMAL ||
			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * If the process is in a non-running type state,
			 * don't touch it.
			 */
			breakout = 0;
			FOREACH_THREAD_IN_PROC(p, td) {
				thread_lock(td);
				if (!TD_ON_RUNQ(td) &&
				    !TD_IS_RUNNING(td) &&
				    !TD_IS_SLEEPING(td) &&
				    !TD_IS_SUSPENDED(td)) {
					thread_unlock(td);
					breakout = 1;
					break;
				}
				thread_unlock(td);
			}
			if (breakout) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * Get a limit.
			 */
			lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
			limit = OFF_TO_IDX(
			    qmin(rsslim.rlim_cur, rsslim.rlim_max));

			/*
			 * Let processes that are swapped out really be
			 * swapped out: set the limit to nothing (this will
			 * force a swap-out.)
			 */
			if ((p->p_flag & P_INMEM) == 0)
				limit = 0;	/* XXX */
			vm = vmspace_acquire_ref(p);
			_PHOLD_LITE(p);
			PROC_UNLOCK(p);
			if (vm == NULL) {
				PRELE(p);
				continue;
			}
			sx_sunlock(&allproc_lock);

			size = vmspace_resident_count(vm);
			if (size >= limit) {
				vm_swapout_map_deactivate_pages(
				    &vm->vm_map, limit);
				size = vmspace_resident_count(vm);
			}
#ifdef RACCT
			if (racct_enable) {
				rsize = IDX_TO_OFF(size);
				PROC_LOCK(p);
				if (p->p_state == PRS_NORMAL)
					racct_set(p, RACCT_RSS, rsize);
				ravailable = racct_get_available(p, RACCT_RSS);
				PROC_UNLOCK(p);
				if (rsize > ravailable) {
					/*
					 * Don't be overly aggressive; this
					 * might be an innocent process,
					 * and the limit could've been exceeded
					 * by some memory hog.  Don't try
					 * to deactivate more than 1/4th
					 * of process' resident set size.
					 */
					if (attempts <= 8) {
						if (ravailable < rsize -
						    (rsize / 4)) {
							ravailable = rsize -
							    (rsize / 4);
						}
					}
					vm_swapout_map_deactivate_pages(
					    &vm->vm_map,
					    OFF_TO_IDX(ravailable));
					/* Update RSS usage after paging out. */
					size = vmspace_resident_count(vm);
					rsize = IDX_TO_OFF(size);
					PROC_LOCK(p);
					if (p->p_state == PRS_NORMAL)
						racct_set(p, RACCT_RSS, rsize);
					PROC_UNLOCK(p);
					if (rsize > ravailable)
						tryagain = 1;
				}
			}
#endif
			vmspace_free(vm);
			sx_slock(&allproc_lock);
			PRELE(p);
		}
		sx_sunlock(&allproc_lock);
		if (tryagain != 0 && attempts <= 10) {
			maybe_yield();
			goto again;
		}
	}
}

/*
 * Allow a thread's kernel stack to be paged out.
 */
static void
vm_thread_swapout(struct thread *td)
{
	vm_page_t m;
	vm_offset_t kaddr;
	vm_pindex_t pindex;
	int i, pages;

	cpu_thread_swapout(td);
	kaddr = td->td_kstack;
	pages = td->td_kstack_pages;
	pindex = atop(kaddr - VM_MIN_KERNEL_ADDRESS);
	pmap_qremove(kaddr, pages);
	VM_OBJECT_WLOCK(kstack_object);
	for (i = 0; i < pages; i++) {
		m = vm_page_lookup(kstack_object, pindex + i);
		if (m == NULL)
			panic("vm_thread_swapout: kstack already missing?");
		vm_page_dirty(m);
		vm_page_xunbusy_unchecked(m);
		vm_page_unwire(m, PQ_LAUNDRY);
	}
	VM_OBJECT_WUNLOCK(kstack_object);
}

/*
 * Bring the kernel stack for a specified thread back in.
 */
static void
vm_thread_swapin(struct thread *td, int oom_alloc)
{
	vm_page_t ma[KSTACK_MAX_PAGES];
	vm_offset_t kaddr;
	int a, count, i, j, pages, rv;

	kaddr = td->td_kstack;
	pages = td->td_kstack_pages;
	vm_thread_stack_back(td->td_domain.dr_policy, kaddr, ma, pages,
	    oom_alloc);
	for (i = 0; i < pages;) {
		vm_page_assert_xbusied(ma[i]);
		if (vm_page_all_valid(ma[i])) {
			i++;
			continue;
		}
		vm_object_pip_add(kstack_object, 1);
		for (j = i + 1; j < pages; j++)
			if (vm_page_all_valid(ma[j]))
				break;
		VM_OBJECT_WLOCK(kstack_object);
		rv = vm_pager_has_page(kstack_object, ma[i]->pindex, NULL, &a);
		VM_OBJECT_WUNLOCK(kstack_object);
		KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
		count = min(a + 1, j - i);
		rv = vm_pager_get_pages(kstack_object, ma + i, count, NULL,
		    NULL);
		KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
		    __func__, td->td_proc->p_pid));
		vm_object_pip_wakeup(kstack_object);
		i += count;
	}
	pmap_qenter(kaddr, ma, pages);
	cpu_thread_swapin(td);
}

/*
 * Bring a swapped-out process back into memory, swapping in the kernel
 * stacks of all its threads.  The process must be locked on entry.
 */
void
faultin(struct proc *p)
{
	struct thread *td;
	int oom_alloc;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	/*
	 * If another process is swapping in this process,
	 * just wait until it finishes.
	 */
	if (p->p_flag & P_SWAPPINGIN) {
		while (p->p_flag & P_SWAPPINGIN)
			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
		return;
	}

	if ((p->p_flag & P_INMEM) == 0) {
		oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM :
		    VM_ALLOC_NORMAL;

		/*
		 * Don't let another thread swap process p out while we are
		 * busy swapping it in.
		 */
		++p->p_lock;
		p->p_flag |= P_SWAPPINGIN;
		PROC_UNLOCK(p);
		sx_xlock(&allproc_lock);
		MPASS(swapped_cnt > 0);
		swapped_cnt--;
		if (curthread != &thread0)
			swap_inprogress++;
		sx_xunlock(&allproc_lock);

		/*
		 * We hold no lock here because the list of threads
		 * cannot change while all threads in the process are
		 * swapped out.
		 */
		FOREACH_THREAD_IN_PROC(p, td)
			vm_thread_swapin(td, oom_alloc);

		if (curthread != &thread0) {
			sx_xlock(&allproc_lock);
			MPASS(swap_inprogress > 0);
			swap_inprogress--;
			last_swapin = ticks;
			sx_xunlock(&allproc_lock);
		}
		PROC_LOCK(p);
		swapclear(p);
		p->p_swtick = ticks;

		/* Allow other threads to swap p out now. */
		wakeup(&p->p_flag);
		--p->p_lock;
	}
}

/*
 * This swapin algorithm attempts to swap-in processes only if there
 * is enough space for them.  Of course, if a process waits for a long
 * time, it will be swapped in anyway.
 */

static struct proc *
swapper_selector(bool wkilled_only)
{
	struct proc *p, *res;
	struct thread *td;
	int ppri, pri, slptime, swtime;

	sx_assert(&allproc_lock, SA_SLOCKED);
	if (swapped_cnt == 0)
		return (NULL);
	res = NULL;
	ppri = INT_MIN;
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT |
		    P_SWAPPINGIN | P_INMEM)) != 0) {
			PROC_UNLOCK(p);
			continue;
		}
		if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) {
			/*
			 * A swapped-out process might have mapped a
			 * large portion of the system's pages as
			 * anonymous memory.  There is no way to
			 * release the memory other than to kill the
			 * process, for which we need to swap it in.
			 */
			return (p);
		}
		if (wkilled_only) {
			PROC_UNLOCK(p);
			continue;
		}
		swtime = (ticks - p->p_swtick) / hz;
		FOREACH_THREAD_IN_PROC(p, td) {
			/*
			 * An otherwise runnable thread of a process
			 * swapped out has only the TDI_SWAPPED bit set.
			 */
			thread_lock(td);
			if (td->td_inhibitors == TDI_SWAPPED) {
				slptime = (ticks - td->td_slptick) / hz;
				pri = swtime + slptime;
				if ((td->td_flags & TDF_SWAPINREQ) == 0)
					pri -= p->p_nice * 8;
				/*
				 * If this thread is higher priority
				 * and there is enough space, then select
				 * this process instead of the previous
				 * selection.
				 */
				if (pri > ppri) {
					res = p;
					ppri = pri;
				}
			}
			thread_unlock(td);
		}
		PROC_UNLOCK(p);
	}

	if (res != NULL)
		PROC_LOCK(res);
	return (res);
}

#define	SWAPIN_INTERVAL	(MAXSLP * hz / 2)

/*
 * Limit swapper to swap in one non-WKILLED process in MAXSLP/2
 * interval, assuming that there is:
 * - at least one domain that is not suffering from a shortage of free memory;
 * - no parallel swap-ins;
 * - no other swap-ins in the current SWAPIN_INTERVAL.
 */
static bool
swapper_wkilled_only(void)
{

	return (vm_page_count_min_set(&all_domains) || swap_inprogress > 0 ||
	    (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL);
}

void
swapper(void)
{
	struct proc *p;

	for (;;) {
		sx_slock(&allproc_lock);
		p = swapper_selector(swapper_wkilled_only());
		sx_sunlock(&allproc_lock);

		if (p == NULL) {
			tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL);
		} else {
			PROC_LOCK_ASSERT(p, MA_OWNED);

			/*
			 * Another process may be bringing or may have
			 * already brought this process in while we
			 * traverse all threads.  Or, this process may
			 * have exited or may even be swapped out
			 * again.
			 */
			if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM |
			    P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) {
				faultin(p);
			}
			PROC_UNLOCK(p);
		}
	}
}

/*
 * First, if any processes have been sleeping or stopped for at least
 * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
 * no such processes exist, then the longest-sleeping or stopped
 * process is swapped out.  Finally, and only as a last resort, if
 * there are no sleeping or stopped processes, the longest-resident
 * process is swapped out.
 */
static void
swapout_procs(int action)
{
	struct proc *p;
	struct thread *td;
	int slptime;
	bool didswap, doswap;

	MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0);

	didswap = false;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		/*
		 * Filter out not yet fully constructed processes.  Do
		 * not swap out held processes.  Avoid processes which
		 * are system, exiting, execing, traced, already swapped
		 * out or are in the process of being swapped in or out.
		 */
		PROC_LOCK(p);
		if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag &
		    (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE |
		    P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) !=
		    P_INMEM) {
			PROC_UNLOCK(p);
			continue;
		}

		/*
		 * Further consideration of this process for swap out
		 * requires iterating over its threads.  We release
		 * allproc_lock here so that process creation and
		 * destruction are not blocked while we iterate.
		 *
		 * To later reacquire allproc_lock and resume
		 * iteration over the allproc list, we will first have
		 * to release the lock on the process.  We place a
		 * hold on the process so that it remains in the
		 * allproc list while it is unlocked.
		 */
		_PHOLD_LITE(p);
		sx_sunlock(&allproc_lock);

		/*
		 * Do not swapout a realtime process.
		 * Guarantee swap_idle_threshold1 time in memory.
		 * If the system is under memory stress, or if we are
		 * swapping idle processes >= swap_idle_threshold2,
		 * then swap the process out.
		 */
		doswap = true;
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			slptime = (ticks - td->td_slptick) / hz;
			if (PRI_IS_REALTIME(td->td_pri_class) ||
			    slptime < swap_idle_threshold1 ||
			    !thread_safetoswapout(td) ||
			    ((action & VM_SWAP_NORMAL) == 0 &&
			    slptime < swap_idle_threshold2))
				doswap = false;
			thread_unlock(td);
			if (!doswap)
				break;
		}
		if (doswap && swapout(p) == 0)
			didswap = true;

		PROC_UNLOCK(p);
		if (didswap) {
			sx_xlock(&allproc_lock);
			swapped_cnt++;
			sx_downgrade(&allproc_lock);
		} else
			sx_slock(&allproc_lock);
		PRELE(p);
	}
	sx_sunlock(&allproc_lock);

	/*
	 * If we swapped something out and another process needed memory,
	 * then wake up the swapper.
	 */
	if (didswap)
		wakeup(&proc0);
}

/*
 * Mark a process and its threads as resident and runnable again after
 * a swap-in, clearing the swapped state.
 */
static void
swapclear(struct proc *p)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		td->td_flags |= TDF_INMEM;
		td->td_flags &= ~TDF_SWAPINREQ;
		TD_CLR_SWAPPED(td);
		if (TD_CAN_RUN(td)) {
			if (setrunnable(td, 0)) {
#ifdef INVARIANTS
				/*
				 * XXX: We just cleared TDI_SWAPPED
				 * above and set TDF_INMEM, so this
				 * should never happen.
				 */
				panic("not waking up swapper");
#endif
			}
		} else
			thread_unlock(td);
	}
	p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
	p->p_flag |= P_INMEM;
}

/*
 * Mark a process as swapped out and page out the kernel stacks of all
 * its threads.
 */
static int
swapout(struct proc *p)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	/*
	 * The states of this process and its threads may have changed
	 * by now.  Assuming that there is only one pageout daemon thread,
	 * this process should still be in memory.
	 */
	KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) ==
	    P_INMEM, ("swapout: lost a swapout race?"));

	/*
	 * Remember the resident count.
	 */
	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);

	/*
	 * Check and mark all threads before we proceed.
	 */
	p->p_flag &= ~P_INMEM;
	p->p_flag |= P_SWAPPINGOUT;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		if (!thread_safetoswapout(td)) {
			thread_unlock(td);
			swapclear(p);
			return (EBUSY);
		}
		td->td_flags &= ~TDF_INMEM;
		TD_SET_SWAPPED(td);
		thread_unlock(td);
	}
	td = FIRST_THREAD_IN_PROC(p);
	++td->td_ru.ru_nswap;
	PROC_UNLOCK(p);

	/*
	 * This list is stable because all threads are now prevented from
	 * running.  The list is only modified in the context of a running
	 * thread in this process.
	 */
	FOREACH_THREAD_IN_PROC(p, td)
		vm_thread_swapout(td);

	PROC_LOCK(p);
	p->p_flag &= ~P_SWAPPINGOUT;
	p->p_swtick = ticks;
	return (0);
}