1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1998 Matthew Dillon, 5 * Copyright (c) 1994 John S. Dyson 6 * Copyright (c) 1990 University of Utah. 7 * Copyright (c) 1982, 1986, 1989, 1993 8 * The Regents of the University of California. All rights reserved. 9 * 10 * This code is derived from software contributed to Berkeley by 11 * the Systems Programming Group of the University of Utah Computer 12 * Science Department. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. All advertising materials mentioning features or use of this software 23 * must display the following acknowledgement: 24 * This product includes software developed by the University of 25 * California, Berkeley and its contributors. 26 * 4. Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 * SUCH DAMAGE. 41 * 42 * New Swap System 43 * Matthew Dillon 44 * 45 * Radix Bitmap 'blists'. 46 * 47 * - The new swapper uses the new radix bitmap code. This should scale 48 * to arbitrarily small or arbitrarily large swap spaces and an almost 49 * arbitrary degree of fragmentation. 50 * 51 * Features: 52 * 53 * - on the fly reallocation of swap during putpages. The new system 54 * does not try to keep previously allocated swap blocks for dirty 55 * pages. 56 * 57 * - on the fly deallocation of swap 58 * 59 * - No more garbage collection required. Unnecessarily allocated swap 60 * blocks only exist for dirty vm_page_t's now and these are already 61 * cycled (in a high-load system) by the pager. We also do on-the-fly 62 * removal of invalidated swap blocks when a page is destroyed 63 * or renamed. 
64 * 65 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ 66 * 67 * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 68 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 69 */ 70 71 #include <sys/cdefs.h> 72 __FBSDID("$FreeBSD$"); 73 74 #include "opt_vm.h" 75 76 #include <sys/param.h> 77 #include <sys/bio.h> 78 #include <sys/blist.h> 79 #include <sys/buf.h> 80 #include <sys/conf.h> 81 #include <sys/disk.h> 82 #include <sys/disklabel.h> 83 #include <sys/eventhandler.h> 84 #include <sys/fcntl.h> 85 #include <sys/limits.h> 86 #include <sys/lock.h> 87 #include <sys/kernel.h> 88 #include <sys/mount.h> 89 #include <sys/namei.h> 90 #include <sys/malloc.h> 91 #include <sys/pctrie.h> 92 #include <sys/priv.h> 93 #include <sys/proc.h> 94 #include <sys/racct.h> 95 #include <sys/resource.h> 96 #include <sys/resourcevar.h> 97 #include <sys/rwlock.h> 98 #include <sys/sbuf.h> 99 #include <sys/sysctl.h> 100 #include <sys/sysproto.h> 101 #include <sys/systm.h> 102 #include <sys/sx.h> 103 #include <sys/unistd.h> 104 #include <sys/user.h> 105 #include <sys/vmmeter.h> 106 #include <sys/vnode.h> 107 108 #include <security/mac/mac_framework.h> 109 110 #include <vm/vm.h> 111 #include <vm/pmap.h> 112 #include <vm/vm_map.h> 113 #include <vm/vm_kern.h> 114 #include <vm/vm_object.h> 115 #include <vm/vm_page.h> 116 #include <vm/vm_pager.h> 117 #include <vm/vm_pageout.h> 118 #include <vm/vm_param.h> 119 #include <vm/swap_pager.h> 120 #include <vm/vm_extern.h> 121 #include <vm/uma.h> 122 123 #include <geom/geom.h> 124 125 /* 126 * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64. 127 * The 64-page limit is due to the radix code (kern/subr_blist.c). 
128 */ 129 #ifndef MAX_PAGEOUT_CLUSTER 130 #define MAX_PAGEOUT_CLUSTER 32 131 #endif 132 133 #if !defined(SWB_NPAGES) 134 #define SWB_NPAGES MAX_PAGEOUT_CLUSTER 135 #endif 136 137 #define SWAP_META_PAGES PCTRIE_COUNT 138 139 /* 140 * A swblk structure maps each page index within a 141 * SWAP_META_PAGES-aligned and sized range to the address of an 142 * on-disk swap block (or SWAPBLK_NONE). The collection of these 143 * mappings for an entire vm object is implemented as a pc-trie. 144 */ 145 struct swblk { 146 vm_pindex_t p; 147 daddr_t d[SWAP_META_PAGES]; 148 }; 149 150 static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); 151 static struct mtx sw_dev_mtx; 152 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); 153 static struct swdevt *swdevhd; /* Allocate from here next */ 154 static int nswapdev; /* Number of swap devices */ 155 int swap_pager_avail; 156 static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ 157 158 static __exclusive_cache_line u_long swap_reserved; 159 static u_long swap_total; 160 static int sysctl_page_shift(SYSCTL_HANDLER_ARGS); 161 162 static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 163 "VM swap stats"); 164 165 SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, 166 &swap_reserved, 0, sysctl_page_shift, "QU", 167 "Amount of swap storage needed to back all allocated anonymous memory."); 168 SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, 169 &swap_total, 0, sysctl_page_shift, "QU", 170 "Total amount of available swap storage."); 171 172 int vm_overcommit __read_mostly = 0; 173 SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &vm_overcommit, 0, 174 "Configure virtual memory overcommit behavior. 
See tuning(7) " 175 "for details."); 176 static unsigned long swzone; 177 SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, 178 "Actual size of swap metadata zone"); 179 static unsigned long swap_maxpages; 180 SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, 181 "Maximum amount of swap supported"); 182 183 static COUNTER_U64_DEFINE_EARLY(swap_free_deferred); 184 SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred, 185 CTLFLAG_RD, &swap_free_deferred, 186 "Number of pages that deferred freeing swap space"); 187 188 static COUNTER_U64_DEFINE_EARLY(swap_free_completed); 189 SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed, 190 CTLFLAG_RD, &swap_free_completed, 191 "Number of deferred frees completed"); 192 193 static int 194 sysctl_page_shift(SYSCTL_HANDLER_ARGS) 195 { 196 uint64_t newval; 197 u_long value = *(u_long *)arg1; 198 199 newval = ((uint64_t)value) << PAGE_SHIFT; 200 return (sysctl_handle_64(oidp, &newval, 0, req)); 201 } 202 203 static bool 204 swap_reserve_by_cred_rlimit(u_long pincr, struct ucred *cred, int oc) 205 { 206 struct uidinfo *uip; 207 u_long prev; 208 209 uip = cred->cr_ruidinfo; 210 211 prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr); 212 if ((oc & SWAP_RESERVE_RLIMIT_ON) != 0 && 213 prev + pincr > lim_cur(curthread, RLIMIT_SWAP) && 214 priv_check(curthread, PRIV_VM_SWAP_NORLIMIT) != 0) { 215 prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr); 216 KASSERT(prev >= pincr, 217 ("negative vmsize for uid %d\n", uip->ui_uid)); 218 return (false); 219 } 220 return (true); 221 } 222 223 static void 224 swap_release_by_cred_rlimit(u_long pdecr, struct ucred *cred) 225 { 226 struct uidinfo *uip; 227 #ifdef INVARIANTS 228 u_long prev; 229 #endif 230 231 uip = cred->cr_ruidinfo; 232 233 #ifdef INVARIANTS 234 prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr); 235 KASSERT(prev >= pdecr, 236 ("negative vmsize for uid %d\n", uip->ui_uid)); 237 #else 238 atomic_subtract_long(&uip->ui_vmsize, 
pdecr); 239 #endif 240 } 241 242 static void 243 swap_reserve_force_rlimit(u_long pincr, struct ucred *cred) 244 { 245 struct uidinfo *uip; 246 247 uip = cred->cr_ruidinfo; 248 atomic_add_long(&uip->ui_vmsize, pincr); 249 } 250 251 bool 252 swap_reserve(vm_ooffset_t incr) 253 { 254 255 return (swap_reserve_by_cred(incr, curthread->td_ucred)); 256 } 257 258 bool 259 swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) 260 { 261 u_long r, s, prev, pincr; 262 #ifdef RACCT 263 int error; 264 #endif 265 int oc; 266 static int curfail; 267 static struct timeval lastfail; 268 269 KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", 270 __func__, (uintmax_t)incr)); 271 272 #ifdef RACCT 273 if (RACCT_ENABLED()) { 274 PROC_LOCK(curproc); 275 error = racct_add(curproc, RACCT_SWAP, incr); 276 PROC_UNLOCK(curproc); 277 if (error != 0) 278 return (false); 279 } 280 #endif 281 282 pincr = atop(incr); 283 prev = atomic_fetchadd_long(&swap_reserved, pincr); 284 r = prev + pincr; 285 s = swap_total; 286 oc = atomic_load_int(&vm_overcommit); 287 if (r > s && (oc & SWAP_RESERVE_ALLOW_NONWIRED) != 0) { 288 s += vm_cnt.v_page_count - vm_cnt.v_free_reserved - 289 vm_wire_count(); 290 } 291 if ((oc & SWAP_RESERVE_FORCE_ON) != 0 && r > s && 292 priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) != 0) { 293 prev = atomic_fetchadd_long(&swap_reserved, -pincr); 294 KASSERT(prev >= pincr, 295 ("swap_reserved < incr on overcommit fail")); 296 goto out_error; 297 } 298 299 if (!swap_reserve_by_cred_rlimit(pincr, cred, oc)) { 300 prev = atomic_fetchadd_long(&swap_reserved, -pincr); 301 KASSERT(prev >= pincr, 302 ("swap_reserved < incr on overcommit fail")); 303 goto out_error; 304 } 305 306 return (true); 307 308 out_error: 309 if (ppsratecheck(&lastfail, &curfail, 1)) { 310 printf("uid %d, pid %d: swap reservation " 311 "for %jd bytes failed\n", 312 cred->cr_ruidinfo->ui_uid, curproc->p_pid, incr); 313 } 314 #ifdef RACCT 315 if (RACCT_ENABLED()) { 316 PROC_LOCK(curproc); 317 
racct_sub(curproc, RACCT_SWAP, incr); 318 PROC_UNLOCK(curproc); 319 } 320 #endif 321 322 return (false); 323 } 324 325 void 326 swap_reserve_force(vm_ooffset_t incr) 327 { 328 u_long pincr; 329 330 KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", 331 __func__, (uintmax_t)incr)); 332 333 #ifdef RACCT 334 if (RACCT_ENABLED()) { 335 PROC_LOCK(curproc); 336 racct_add_force(curproc, RACCT_SWAP, incr); 337 PROC_UNLOCK(curproc); 338 } 339 #endif 340 pincr = atop(incr); 341 atomic_add_long(&swap_reserved, pincr); 342 swap_reserve_force_rlimit(pincr, curthread->td_ucred); 343 } 344 345 void 346 swap_release(vm_ooffset_t decr) 347 { 348 struct ucred *cred; 349 350 PROC_LOCK(curproc); 351 cred = curproc->p_ucred; 352 swap_release_by_cred(decr, cred); 353 PROC_UNLOCK(curproc); 354 } 355 356 void 357 swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) 358 { 359 u_long pdecr; 360 #ifdef INVARIANTS 361 u_long prev; 362 #endif 363 364 KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", 365 __func__, (uintmax_t)decr)); 366 367 pdecr = atop(decr); 368 #ifdef INVARIANTS 369 prev = atomic_fetchadd_long(&swap_reserved, -pdecr); 370 KASSERT(prev >= pdecr, ("swap_reserved < decr")); 371 #else 372 atomic_subtract_long(&swap_reserved, pdecr); 373 #endif 374 375 swap_release_by_cred_rlimit(pdecr, cred); 376 #ifdef RACCT 377 if (racct_enable) 378 racct_sub_cred(cred, RACCT_SWAP, decr); 379 #endif 380 } 381 382 static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ 383 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ 384 static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ 385 static int nsw_wcount_async; /* limit async write buffers */ 386 static int nsw_wcount_async_max;/* assigned maximum */ 387 int nsw_cluster_max; /* maximum VOP I/O allowed */ 388 389 static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); 390 SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | 391 CTLFLAG_MPSAFE, 
NULL, 0, sysctl_swap_async_max, "I", 392 "Maximum running async swap ops"); 393 static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS); 394 SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD | 395 CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A", 396 "Swap Fragmentation Info"); 397 398 static struct sx sw_alloc_sx; 399 400 /* 401 * "named" and "unnamed" anon region objects. Try to reduce the overhead 402 * of searching a named list by hashing it just a little. 403 */ 404 405 #define NOBJLISTS 8 406 407 #define NOBJLIST(handle) \ 408 (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) 409 410 static struct pagerlst swap_pager_object_list[NOBJLISTS]; 411 static uma_zone_t swwbuf_zone; 412 static uma_zone_t swrbuf_zone; 413 static uma_zone_t swblk_zone; 414 static uma_zone_t swpctrie_zone; 415 416 /* 417 * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure 418 * calls hooked from other parts of the VM system and do not appear here. 419 * (see vm/swap_pager.h). 
420 */ 421 static vm_object_t 422 swap_pager_alloc(void *handle, vm_ooffset_t size, 423 vm_prot_t prot, vm_ooffset_t offset, struct ucred *); 424 static void swap_pager_dealloc(vm_object_t object); 425 static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, 426 int *); 427 static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, 428 int *, pgo_getpages_iodone_t, void *); 429 static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); 430 static boolean_t 431 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); 432 static void swap_pager_init(void); 433 static void swap_pager_unswapped(vm_page_t); 434 static void swap_pager_swapoff(struct swdevt *sp); 435 static void swap_pager_update_writecount(vm_object_t object, 436 vm_offset_t start, vm_offset_t end); 437 static void swap_pager_release_writecount(vm_object_t object, 438 vm_offset_t start, vm_offset_t end); 439 static void swap_pager_freespace(vm_object_t object, vm_pindex_t start, 440 vm_size_t size); 441 442 const struct pagerops swappagerops = { 443 .pgo_kvme_type = KVME_TYPE_SWAP, 444 .pgo_init = swap_pager_init, /* early system initialization of pager */ 445 .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ 446 .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ 447 .pgo_getpages = swap_pager_getpages, /* pagein */ 448 .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ 449 .pgo_putpages = swap_pager_putpages, /* pageout */ 450 .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ 451 .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ 452 .pgo_update_writecount = swap_pager_update_writecount, 453 .pgo_release_writecount = swap_pager_release_writecount, 454 .pgo_freespace = swap_pager_freespace, 455 }; 456 457 /* 458 * swap_*() routines are externally accessible. swp_*() routines are 459 * internal. 
460 */ 461 static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ 462 static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ 463 464 SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0, 465 "Maximum size of a swap block in pages"); 466 467 static void swp_sizecheck(void); 468 static void swp_pager_async_iodone(struct buf *bp); 469 static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit); 470 static void swp_pager_free_empty_swblk(vm_object_t, struct swblk *sb); 471 static int swapongeom(struct vnode *); 472 static int swaponvp(struct thread *, struct vnode *, u_long); 473 static int swapoff_one(struct swdevt *sp, struct ucred *cred, 474 u_int flags); 475 476 /* 477 * Swap bitmap functions 478 */ 479 static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages); 480 static daddr_t swp_pager_getswapspace(int *npages); 481 482 /* 483 * Metadata functions 484 */ 485 static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); 486 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); 487 static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst, 488 vm_pindex_t pindex, vm_pindex_t count); 489 static void swp_pager_meta_free_all(vm_object_t); 490 static daddr_t swp_pager_meta_lookup(vm_object_t, vm_pindex_t); 491 492 static void 493 swp_pager_init_freerange(daddr_t *start, daddr_t *num) 494 { 495 496 *start = SWAPBLK_NONE; 497 *num = 0; 498 } 499 500 static void 501 swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr) 502 { 503 504 if (*start + *num == addr) { 505 (*num)++; 506 } else { 507 swp_pager_freeswapspace(*start, *num); 508 *start = addr; 509 *num = 1; 510 } 511 } 512 513 static void * 514 swblk_trie_alloc(struct pctrie *ptree) 515 { 516 517 return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ? 
518 M_USE_RESERVE : 0))); 519 } 520 521 static void 522 swblk_trie_free(struct pctrie *ptree, void *node) 523 { 524 525 uma_zfree(swpctrie_zone, node); 526 } 527 528 PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free); 529 530 /* 531 * SWP_SIZECHECK() - update swap_pager_full indication 532 * 533 * update the swap_pager_almost_full indication and warn when we are 534 * about to run out of swap space, using lowat/hiwat hysteresis. 535 * 536 * Clear swap_pager_full ( task killing ) indication when lowat is met. 537 * 538 * No restrictions on call 539 * This routine may not block. 540 */ 541 static void 542 swp_sizecheck(void) 543 { 544 545 if (swap_pager_avail < nswap_lowat) { 546 if (swap_pager_almost_full == 0) { 547 printf("swap_pager: out of swap space\n"); 548 swap_pager_almost_full = 1; 549 } 550 } else { 551 swap_pager_full = 0; 552 if (swap_pager_avail > nswap_hiwat) 553 swap_pager_almost_full = 0; 554 } 555 } 556 557 /* 558 * SWAP_PAGER_INIT() - initialize the swap pager! 559 * 560 * Expected to be started from system init. NOTE: This code is run 561 * before much else so be careful what you depend on. Most of the VM 562 * system has yet to be initialized at this point. 563 */ 564 static void 565 swap_pager_init(void) 566 { 567 /* 568 * Initialize object lists 569 */ 570 int i; 571 572 for (i = 0; i < NOBJLISTS; ++i) 573 TAILQ_INIT(&swap_pager_object_list[i]); 574 mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); 575 sx_init(&sw_alloc_sx, "swspsx"); 576 sx_init(&swdev_syscall_lock, "swsysc"); 577 578 /* 579 * The nsw_cluster_max is constrained by the bp->b_pages[] 580 * array, which has maxphys / PAGE_SIZE entries, and our locally 581 * defined MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are 582 * constrained by the swap device interleave stripe size. 583 * 584 * Initialized early so that GEOM_ELI can see it. 
585 */ 586 nsw_cluster_max = min(maxphys / PAGE_SIZE, MAX_PAGEOUT_CLUSTER); 587 } 588 589 /* 590 * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process 591 * 592 * Expected to be started from pageout process once, prior to entering 593 * its main loop. 594 */ 595 void 596 swap_pager_swap_init(void) 597 { 598 unsigned long n, n2; 599 600 /* 601 * Number of in-transit swap bp operations. Don't 602 * exhaust the pbufs completely. Make sure we 603 * initialize workable values (0 will work for hysteresis 604 * but it isn't very efficient). 605 * 606 * Currently we hardwire nsw_wcount_async to 4. This limit is 607 * designed to prevent other I/O from having high latencies due to 608 * our pageout I/O. The value 4 works well for one or two active swap 609 * devices but is probably a little low if you have more. Even so, 610 * a higher value would probably generate only a limited improvement 611 * with three or four active swap devices since the system does not 612 * typically have to pageout at extreme bandwidths. We will want 613 * at least 2 per swap devices, and 4 is a pretty good value if you 614 * have one NFS swap device due to the command/ack latency over NFS. 615 * So it all works out pretty well. 616 * 617 * nsw_cluster_max is initialized in swap_pager_init(). 618 */ 619 620 nsw_wcount_async = 4; 621 nsw_wcount_async_max = nsw_wcount_async; 622 mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF); 623 624 swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4); 625 swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2); 626 627 /* 628 * Initialize our zone, taking the user's requested size or 629 * estimating the number we need based on the number of pages 630 * in the system. 631 */ 632 n = maxswzone != 0 ? 
maxswzone / sizeof(struct swblk) : 633 vm_cnt.v_page_count / 2; 634 swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, 635 pctrie_zone_init, NULL, UMA_ALIGN_PTR, 0); 636 swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, 637 NULL, NULL, _Alignof(struct swblk) - 1, 0); 638 n2 = n; 639 do { 640 if (uma_zone_reserve_kva(swblk_zone, n)) 641 break; 642 /* 643 * if the allocation failed, try a zone two thirds the 644 * size of the previous attempt. 645 */ 646 n -= ((n + 2) / 3); 647 } while (n > 0); 648 649 /* 650 * Often uma_zone_reserve_kva() cannot reserve exactly the 651 * requested size. Account for the difference when 652 * calculating swap_maxpages. 653 */ 654 n = uma_zone_get_max(swblk_zone); 655 656 if (n < n2) 657 printf("Swap blk zone entries changed from %lu to %lu.\n", 658 n2, n); 659 /* absolute maximum we can handle assuming 100% efficiency */ 660 swap_maxpages = n * SWAP_META_PAGES; 661 swzone = n * sizeof(struct swblk); 662 if (!uma_zone_reserve_kva(swpctrie_zone, n)) 663 printf("Cannot reserve swap pctrie zone, " 664 "reduce kern.maxswzone.\n"); 665 } 666 667 bool 668 swap_pager_init_object(vm_object_t object, void *handle, struct ucred *cred, 669 vm_ooffset_t size, vm_ooffset_t offset) 670 { 671 if (cred != NULL) { 672 if (!swap_reserve_by_cred(size, cred)) 673 return (false); 674 crhold(cred); 675 } 676 677 object->un_pager.swp.writemappings = 0; 678 object->handle = handle; 679 if (cred != NULL) { 680 object->cred = cred; 681 object->charge = size; 682 } 683 return (true); 684 } 685 686 static vm_object_t 687 swap_pager_alloc_init(objtype_t otype, void *handle, struct ucred *cred, 688 vm_ooffset_t size, vm_ooffset_t offset) 689 { 690 vm_object_t object; 691 692 /* 693 * The un_pager.swp.swp_blks trie is initialized by 694 * vm_object_allocate() to ensure the correct order of 695 * visibility to other threads. 
696 */ 697 object = vm_object_allocate(otype, OFF_TO_IDX(offset + 698 PAGE_MASK + size)); 699 700 if (!swap_pager_init_object(object, handle, cred, size, offset)) { 701 vm_object_deallocate(object); 702 return (NULL); 703 } 704 return (object); 705 } 706 707 /* 708 * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate 709 * its metadata structures. 710 * 711 * This routine is called from the mmap and fork code to create a new 712 * OBJT_SWAP object. 713 * 714 * This routine must ensure that no live duplicate is created for 715 * the named object request, which is protected against by 716 * holding the sw_alloc_sx lock in case handle != NULL. 717 */ 718 static vm_object_t 719 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 720 vm_ooffset_t offset, struct ucred *cred) 721 { 722 vm_object_t object; 723 724 if (handle != NULL) { 725 /* 726 * Reference existing named region or allocate new one. There 727 * should not be a race here against swp_pager_meta_build() 728 * as called from vm_page_remove() in regards to the lookup 729 * of the handle. 730 */ 731 sx_xlock(&sw_alloc_sx); 732 object = vm_pager_object_lookup(NOBJLIST(handle), handle); 733 if (object == NULL) { 734 object = swap_pager_alloc_init(OBJT_SWAP, handle, cred, 735 size, offset); 736 if (object != NULL) { 737 TAILQ_INSERT_TAIL(NOBJLIST(object->handle), 738 object, pager_object_list); 739 } 740 } 741 sx_xunlock(&sw_alloc_sx); 742 } else { 743 object = swap_pager_alloc_init(OBJT_SWAP, handle, cred, 744 size, offset); 745 } 746 return (object); 747 } 748 749 /* 750 * SWAP_PAGER_DEALLOC() - remove swap metadata from object 751 * 752 * The swap backing for the object is destroyed. The code is 753 * designed such that we can reinstantiate it later, but this 754 * routine is typically called only when the entire object is 755 * about to be destroyed. 756 * 757 * The object must be locked. 
758 */ 759 static void 760 swap_pager_dealloc(vm_object_t object) 761 { 762 763 VM_OBJECT_ASSERT_WLOCKED(object); 764 KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); 765 766 /* 767 * Remove from list right away so lookups will fail if we block for 768 * pageout completion. 769 */ 770 if ((object->flags & OBJ_ANON) == 0 && object->handle != NULL) { 771 VM_OBJECT_WUNLOCK(object); 772 sx_xlock(&sw_alloc_sx); 773 TAILQ_REMOVE(NOBJLIST(object->handle), object, 774 pager_object_list); 775 sx_xunlock(&sw_alloc_sx); 776 VM_OBJECT_WLOCK(object); 777 } 778 779 vm_object_pip_wait(object, "swpdea"); 780 781 /* 782 * Free all remaining metadata. We only bother to free it from 783 * the swap meta data. We do not attempt to free swapblk's still 784 * associated with vm_page_t's for this object. We do not care 785 * if paging is still in progress on some objects. 786 */ 787 swp_pager_meta_free_all(object); 788 object->handle = NULL; 789 object->type = OBJT_DEAD; 790 791 /* 792 * Release the allocation charge. 793 */ 794 if (object->cred != NULL) { 795 swap_release_by_cred(object->charge, object->cred); 796 object->charge = 0; 797 crfree(object->cred); 798 object->cred = NULL; 799 } 800 801 /* 802 * Hide the object from swap_pager_swapoff(). 803 */ 804 vm_object_clear_flag(object, OBJ_SWAP); 805 } 806 807 /************************************************************************ 808 * SWAP PAGER BITMAP ROUTINES * 809 ************************************************************************/ 810 811 /* 812 * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space 813 * 814 * Allocate swap for up to the requested number of pages. The 815 * starting swap block number (a page index) is returned or 816 * SWAPBLK_NONE if the allocation failed. 817 * 818 * Also has the side effect of advising that somebody made a mistake 819 * when they configured swap and didn't configure enough. 820 * 821 * This routine may not sleep. 
822 * 823 * We allocate in round-robin fashion from the configured devices. 824 */ 825 static daddr_t 826 swp_pager_getswapspace(int *io_npages) 827 { 828 daddr_t blk; 829 struct swdevt *sp; 830 int mpages, npages; 831 832 KASSERT(*io_npages >= 1, 833 ("%s: npages not positive", __func__)); 834 blk = SWAPBLK_NONE; 835 mpages = *io_npages; 836 npages = imin(BLIST_MAX_ALLOC, mpages); 837 mtx_lock(&sw_dev_mtx); 838 sp = swdevhd; 839 while (!TAILQ_EMPTY(&swtailq)) { 840 if (sp == NULL) 841 sp = TAILQ_FIRST(&swtailq); 842 if ((sp->sw_flags & SW_CLOSING) == 0) 843 blk = blist_alloc(sp->sw_blist, &npages, mpages); 844 if (blk != SWAPBLK_NONE) 845 break; 846 sp = TAILQ_NEXT(sp, sw_list); 847 if (swdevhd == sp) { 848 if (npages == 1) 849 break; 850 mpages = npages - 1; 851 npages >>= 1; 852 } 853 } 854 if (blk != SWAPBLK_NONE) { 855 *io_npages = npages; 856 blk += sp->sw_first; 857 sp->sw_used += npages; 858 swap_pager_avail -= npages; 859 swp_sizecheck(); 860 swdevhd = TAILQ_NEXT(sp, sw_list); 861 } else { 862 if (swap_pager_full != 2) { 863 printf("swp_pager_getswapspace(%d): failed\n", 864 *io_npages); 865 swap_pager_full = 2; 866 swap_pager_almost_full = 1; 867 } 868 swdevhd = NULL; 869 } 870 mtx_unlock(&sw_dev_mtx); 871 return (blk); 872 } 873 874 static bool 875 swp_pager_isondev(daddr_t blk, struct swdevt *sp) 876 { 877 878 return (blk >= sp->sw_first && blk < sp->sw_end); 879 } 880 881 static void 882 swp_pager_strategy(struct buf *bp) 883 { 884 struct swdevt *sp; 885 886 mtx_lock(&sw_dev_mtx); 887 TAILQ_FOREACH(sp, &swtailq, sw_list) { 888 if (swp_pager_isondev(bp->b_blkno, sp)) { 889 mtx_unlock(&sw_dev_mtx); 890 if ((sp->sw_flags & SW_UNMAPPED) != 0 && 891 unmapped_buf_allowed) { 892 bp->b_data = unmapped_buf; 893 bp->b_offset = 0; 894 } else { 895 pmap_qenter((vm_offset_t)bp->b_data, 896 &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); 897 } 898 sp->sw_strategy(bp, sp); 899 return; 900 } 901 } 902 panic("Swapdev not found"); 903 } 904 905 /* 906 * 
SWP_PAGER_FREESWAPSPACE() - free raw swap space 907 * 908 * This routine returns the specified swap blocks back to the bitmap. 909 * 910 * This routine may not sleep. 911 */ 912 static void 913 swp_pager_freeswapspace(daddr_t blk, daddr_t npages) 914 { 915 struct swdevt *sp; 916 917 if (npages == 0) 918 return; 919 mtx_lock(&sw_dev_mtx); 920 TAILQ_FOREACH(sp, &swtailq, sw_list) { 921 if (swp_pager_isondev(blk, sp)) { 922 sp->sw_used -= npages; 923 /* 924 * If we are attempting to stop swapping on 925 * this device, we don't want to mark any 926 * blocks free lest they be reused. 927 */ 928 if ((sp->sw_flags & SW_CLOSING) == 0) { 929 blist_free(sp->sw_blist, blk - sp->sw_first, 930 npages); 931 swap_pager_avail += npages; 932 swp_sizecheck(); 933 } 934 mtx_unlock(&sw_dev_mtx); 935 return; 936 } 937 } 938 panic("Swapdev not found"); 939 } 940 941 /* 942 * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats 943 */ 944 static int 945 sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS) 946 { 947 struct sbuf sbuf; 948 struct swdevt *sp; 949 const char *devname; 950 int error; 951 952 error = sysctl_wire_old_buffer(req, 0); 953 if (error != 0) 954 return (error); 955 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 956 mtx_lock(&sw_dev_mtx); 957 TAILQ_FOREACH(sp, &swtailq, sw_list) { 958 if (vn_isdisk(sp->sw_vp)) 959 devname = devtoname(sp->sw_vp->v_rdev); 960 else 961 devname = "[file]"; 962 sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname); 963 blist_stats(sp->sw_blist, &sbuf); 964 } 965 mtx_unlock(&sw_dev_mtx); 966 error = sbuf_finish(&sbuf); 967 sbuf_delete(&sbuf); 968 return (error); 969 } 970 971 /* 972 * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page 973 * range within an object. 974 * 975 * This routine removes swapblk assignments from swap metadata. 976 * 977 * The external callers of this routine typically have already destroyed 978 * or renamed vm_page_t's associated with this range in the object so 979 * we should be ok. 
980 * 981 * The object must be locked. 982 */ 983 static void 984 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) 985 { 986 987 swp_pager_meta_free(object, start, size); 988 } 989 990 /* 991 * SWAP_PAGER_RESERVE() - reserve swap blocks in object 992 * 993 * Assigns swap blocks to the specified range within the object. The 994 * swap blocks are not zeroed. Any previous swap assignment is destroyed. 995 * 996 * Returns 0 on success, -1 on failure. 997 */ 998 int 999 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_pindex_t size) 1000 { 1001 daddr_t addr, blk, n_free, s_free; 1002 vm_pindex_t i, j; 1003 int n; 1004 1005 swp_pager_init_freerange(&s_free, &n_free); 1006 VM_OBJECT_WLOCK(object); 1007 for (i = 0; i < size; i += n) { 1008 n = MIN(size - i, INT_MAX); 1009 blk = swp_pager_getswapspace(&n); 1010 if (blk == SWAPBLK_NONE) { 1011 swp_pager_meta_free(object, start, i); 1012 VM_OBJECT_WUNLOCK(object); 1013 return (-1); 1014 } 1015 for (j = 0; j < n; ++j) { 1016 addr = swp_pager_meta_build(object, 1017 start + i + j, blk + j); 1018 if (addr != SWAPBLK_NONE) 1019 swp_pager_update_freerange(&s_free, &n_free, 1020 addr); 1021 } 1022 } 1023 swp_pager_freeswapspace(s_free, n_free); 1024 VM_OBJECT_WUNLOCK(object); 1025 return (0); 1026 } 1027 1028 static bool 1029 swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject, 1030 vm_pindex_t pindex, daddr_t addr) 1031 { 1032 daddr_t dstaddr __diagused; 1033 1034 KASSERT((srcobject->flags & OBJ_SWAP) != 0, 1035 ("%s: srcobject not swappable", __func__)); 1036 KASSERT((dstobject->flags & OBJ_SWAP) != 0, 1037 ("%s: dstobject not swappable", __func__)); 1038 1039 if (swp_pager_meta_lookup(dstobject, pindex) != SWAPBLK_NONE) { 1040 /* Caller should destroy the source block. */ 1041 return (false); 1042 } 1043 1044 /* 1045 * Destination has no swapblk and is not resident, transfer source. 1046 * swp_pager_meta_build() can sleep. 
1047 */ 1048 VM_OBJECT_WUNLOCK(srcobject); 1049 dstaddr = swp_pager_meta_build(dstobject, pindex, addr); 1050 KASSERT(dstaddr == SWAPBLK_NONE, 1051 ("Unexpected destination swapblk")); 1052 VM_OBJECT_WLOCK(srcobject); 1053 1054 return (true); 1055 } 1056 1057 /* 1058 * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager 1059 * and destroy the source. 1060 * 1061 * Copy any valid swapblks from the source to the destination. In 1062 * cases where both the source and destination have a valid swapblk, 1063 * we keep the destination's. 1064 * 1065 * This routine is allowed to sleep. It may sleep allocating metadata 1066 * indirectly through swp_pager_meta_build(). 1067 * 1068 * The source object contains no vm_page_t's (which is just as well) 1069 * 1070 * The source and destination objects must be locked. 1071 * Both object locks may temporarily be released. 1072 */ 1073 void 1074 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, 1075 vm_pindex_t offset, int destroysource) 1076 { 1077 VM_OBJECT_ASSERT_WLOCKED(srcobject); 1078 VM_OBJECT_ASSERT_WLOCKED(dstobject); 1079 1080 /* 1081 * If destroysource is set, we remove the source object from the 1082 * swap_pager internal queue now. 1083 */ 1084 if (destroysource && (srcobject->flags & OBJ_ANON) == 0 && 1085 srcobject->handle != NULL) { 1086 VM_OBJECT_WUNLOCK(srcobject); 1087 VM_OBJECT_WUNLOCK(dstobject); 1088 sx_xlock(&sw_alloc_sx); 1089 TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, 1090 pager_object_list); 1091 sx_xunlock(&sw_alloc_sx); 1092 VM_OBJECT_WLOCK(dstobject); 1093 VM_OBJECT_WLOCK(srcobject); 1094 } 1095 1096 /* 1097 * Transfer source to destination. 1098 */ 1099 swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size); 1100 1101 /* 1102 * Free left over swap blocks in source. 
1103 */ 1104 if (destroysource) 1105 swp_pager_meta_free_all(srcobject); 1106 } 1107 1108 /* 1109 * SWAP_PAGER_HASPAGE() - determine if we have good backing store for 1110 * the requested page. 1111 * 1112 * We determine whether good backing store exists for the requested 1113 * page and return TRUE if it does, FALSE if it doesn't. 1114 * 1115 * If TRUE, we also try to determine how much valid, contiguous backing 1116 * store exists before and after the requested page. 1117 */ 1118 static boolean_t 1119 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, 1120 int *after) 1121 { 1122 daddr_t blk, blk0; 1123 int i; 1124 1125 VM_OBJECT_ASSERT_LOCKED(object); 1126 KASSERT((object->flags & OBJ_SWAP) != 0, 1127 ("%s: object not swappable", __func__)); 1128 1129 /* 1130 * do we have good backing store at the requested index ? 1131 */ 1132 blk0 = swp_pager_meta_lookup(object, pindex); 1133 if (blk0 == SWAPBLK_NONE) { 1134 if (before) 1135 *before = 0; 1136 if (after) 1137 *after = 0; 1138 return (FALSE); 1139 } 1140 1141 /* 1142 * find backwards-looking contiguous good backing store 1143 */ 1144 if (before != NULL) { 1145 for (i = 1; i < SWB_NPAGES; i++) { 1146 if (i > pindex) 1147 break; 1148 blk = swp_pager_meta_lookup(object, pindex - i); 1149 if (blk != blk0 - i) 1150 break; 1151 } 1152 *before = i - 1; 1153 } 1154 1155 /* 1156 * find forward-looking contiguous good backing store 1157 */ 1158 if (after != NULL) { 1159 for (i = 1; i < SWB_NPAGES; i++) { 1160 blk = swp_pager_meta_lookup(object, pindex + i); 1161 if (blk != blk0 + i) 1162 break; 1163 } 1164 *after = i - 1; 1165 } 1166 return (TRUE); 1167 } 1168 1169 /* 1170 * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page 1171 * 1172 * This removes any associated swap backing store, whether valid or 1173 * not, from the page. 1174 * 1175 * This routine is typically called when a page is made dirty, at 1176 * which point any associated swap can be freed. 
MADV_FREE also
 *	calls us in a special-case situation
 *
 *	NOTE!!!  If the page is clean and the swap was valid, the caller
 *	should make the page dirty before calling this routine.  This routine
 *	does NOT change the m->dirty status of the page.  Also: MADV_FREE
 *	depends on it.
 *
 *	This routine may not sleep.
 *
 *	The object containing the page may be locked.  When the caller does
 *	not own the object write lock, the free is only recorded on the page
 *	(PGA_SWAP_FREE) and no metadata is touched.
 */
static void
swap_pager_unswapped(vm_page_t m)
{
	struct swblk *sb;
	vm_object_t obj;

	/*
	 * Handle enqueueing deferred frees first.  If we do not have the
	 * object lock we wait for the page daemon to clear the space.
	 */
	obj = m->object;
	if (!VM_OBJECT_WOWNED(obj)) {
		VM_PAGE_OBJECT_BUSY_ASSERT(m);
		/*
		 * The caller is responsible for synchronization but we
		 * will harmlessly handle races.  This is typically provided
		 * by only calling unswapped() when a page transitions from
		 * clean to dirty.
		 */
		if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) ==
		    PGA_SWAP_SPACE) {
			vm_page_aflag_set(m, PGA_SWAP_FREE);
			counter_u64_add(swap_free_deferred, 1);
		}
		return;
	}

	/* Lock is owned: a previously deferred free is completed now. */
	if ((m->a.flags & PGA_SWAP_FREE) != 0)
		counter_u64_add(swap_free_completed, 1);
	vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE);

	/*
	 * The meta data only exists if the object is OBJT_SWAP
	 * and even then might not be allocated yet.
	 */
	KASSERT((m->object->flags & OBJ_SWAP) != 0,
	    ("Free object not swappable"));

	sb = SWAP_PCTRIE_LOOKUP(&m->object->un_pager.swp.swp_blks,
	    rounddown(m->pindex, SWAP_META_PAGES));
	if (sb == NULL)
		return;
	if (sb->d[m->pindex % SWAP_META_PAGES] == SWAPBLK_NONE)
		return;

	/* Release the block, clear the slot, and drop the swblk if empty. */
	swp_pager_freeswapspace(sb->d[m->pindex % SWAP_META_PAGES], 1);
	sb->d[m->pindex % SWAP_META_PAGES] = SWAPBLK_NONE;
	swp_pager_free_empty_swblk(m->object, sb);
}

/*
 * swap_pager_getpages() - bring pages in from swap
 *
 *	Attempt to page in the pages in array "ma" of length "count".  The
 *	caller may optionally specify that additional pages preceding and
 *	succeeding the specified range be paged in.  The number of such pages
 *	is returned in the "rbehind" and "rahead" parameters, and they will
 *	be in the inactive queue upon return.
 *
 *	The pages in "ma" must be busied and will remain busied upon return.
 *
 *	The _locked variant is entered with the object write-locked and
 *	returns with the object unlocked on every path.
 *
 *	Returns VM_PAGER_FAIL if there is no swap block for ma[0],
 *	VM_PAGER_ERROR if any requested page is not fully valid after the
 *	read, and VM_PAGER_OK otherwise.
 */
static int
swap_pager_getpages_locked(vm_object_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead)
{
	struct buf *bp;
	vm_page_t bm, mpred, msucc, p;
	vm_pindex_t pindex;
	daddr_t blk;
	int i, maxahead, maxbehind, reqcount;

	VM_OBJECT_ASSERT_WLOCKED(object);
	/* "count" grows below; reqcount keeps the caller's original count. */
	reqcount = count;

	KASSERT((object->flags & OBJ_SWAP) != 0,
	    ("%s: object not swappable", __func__));
	if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) {
		VM_OBJECT_WUNLOCK(object);
		return (VM_PAGER_FAIL);
	}

	KASSERT(reqcount - 1 <= maxahead,
	    ("page count %d extends beyond swap block", reqcount));

	/*
	 * Do not transfer any pages other than those that are xbusied
	 * when running during a split or collapse operation.  This
	 * prevents clustering from re-creating pages which are being
	 * moved into another object.
	 */
	if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
		maxahead = reqcount - 1;
		maxbehind = 0;
	}

	/*
	 * Clip the readahead and readbehind ranges to exclude resident pages.
	 */
	if (rahead != NULL) {
		*rahead = imin(*rahead, maxahead - (reqcount - 1));
		pindex = ma[reqcount - 1]->pindex;
		msucc = TAILQ_NEXT(ma[reqcount - 1], listq);
		if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead)
			*rahead = msucc->pindex - pindex - 1;
	}
	if (rbehind != NULL) {
		*rbehind = imin(*rbehind, maxbehind);
		pindex = ma[0]->pindex;
		mpred = TAILQ_PREV(ma[0], pglist, listq);
		if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind)
			*rbehind = pindex - mpred->pindex - 1;
	}

	/* bm tracks the lowest-indexed page that will be part of the I/O. */
	bm = ma[0];
	for (i = 0; i < count; i++)
		ma[i]->oflags |= VPO_SWAPINPROG;

	/*
	 * Allocate readahead and readbehind pages.  Allocation failure
	 * simply shortens the corresponding range.
	 */
	if (rbehind != NULL) {
		for (i = 1; i <= *rbehind; i++) {
			p = vm_page_alloc(object, ma[0]->pindex - i,
			    VM_ALLOC_NORMAL);
			if (p == NULL)
				break;
			p->oflags |= VPO_SWAPINPROG;
			bm = p;
		}
		*rbehind = i - 1;
	}
	if (rahead != NULL) {
		for (i = 0; i < *rahead; i++) {
			p = vm_page_alloc(object,
			    ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL);
			if (p == NULL)
				break;
			p->oflags |= VPO_SWAPINPROG;
		}
		*rahead = i;
	}
	if (rbehind != NULL)
		count += *rbehind;
	if (rahead != NULL)
		count += *rahead;

	vm_object_pip_add(object, count);

	pindex = bm->pindex;
	blk = swp_pager_meta_lookup(object, pindex);
	KASSERT(blk != SWAPBLK_NONE,
	    ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));

	VM_OBJECT_WUNLOCK(object);
	bp = uma_zalloc(swrbuf_zone, M_WAITOK);
	MPASS((bp->b_flags & B_MAXPHYS) != 0);
	/* Pages cannot leave the object while busy. */
	for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
		MPASS(p->pindex == bm->pindex + i);
		bp->b_pages[i] = p;
	}

	bp->b_flags |= B_PAGING;
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = swp_pager_async_iodone;
	bp->b_rcred = crhold(thread0.td_ucred);
	bp->b_wcred = crhold(thread0.td_ucred);
	bp->b_blkno = blk;
	bp->b_bcount = PAGE_SIZE * count;
	bp->b_bufsize = PAGE_SIZE * count;
	bp->b_npages = count;
	bp->b_pgbefore = rbehind != NULL ? *rbehind : 0;
	bp->b_pgafter = rahead != NULL ? *rahead : 0;

	VM_CNT_INC(v_swapin);
	VM_CNT_ADD(v_swappgsin, count);

	/*
	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
	 * this point because we automatically release it on completion.
	 * Instead, we look at the one page we are interested in which we
	 * still hold a lock on even through the I/O completion.
	 *
	 * The other pages in our ma[] array are also released on completion,
	 * so we cannot assume they are valid anymore either.
	 *
	 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
	 */
	BUF_KERNPROC(bp);
	swp_pager_strategy(bp);

	/*
	 * Wait for the pages we want to complete.  VPO_SWAPINPROG is always
	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
	 * is set in the metadata for each page in the request.
	 */
	VM_OBJECT_WLOCK(object);
	/* This could be implemented more efficiently with aflags */
	while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) {
		ma[0]->oflags |= VPO_SWAPSLEEP;
		VM_CNT_INC(v_intrans);
		if (VM_OBJECT_SLEEP(object, &object->handle, PSWP,
		    "swread", hz * 20)) {
			/*
			 * NOTE(review): this message dereferences bp even
			 * though the comment above says bp is not valid once
			 * the I/O has been submitted -- confirm this is safe.
			 */
			printf(
"swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
			    bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
		}
	}
	VM_OBJECT_WUNLOCK(object);

	/*
	 * If we had an unrecoverable read error pages will not be valid.
	 * Only the caller's original pages are checked, not the readahead
	 * or readbehind pages.
	 */
	for (i = 0; i < reqcount; i++)
		if (ma[i]->valid != VM_PAGE_BITS_ALL)
			return (VM_PAGER_ERROR);

	return (VM_PAGER_OK);

	/*
	 * A final note: in a low swap situation, we cannot deallocate swap
	 * and mark a page dirty here because the caller is likely to mark
	 * the page clean when we return, causing the page to possibly revert
	 * to all-zero's later.
	 */
}

/*
 * Pager entry point: take the object write lock and hand off to
 * swap_pager_getpages_locked(), which releases the lock before returning.
 */
static int
swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead)
{

	VM_OBJECT_WLOCK(object);
	return (swap_pager_getpages_locked(object, ma, count, rbehind, rahead));
}

/*
 *	swap_pager_getpages_async():
 *
 *	Right now this is emulation of asynchronous operation on top of
 *	swap_pager_getpages().
 *
 *	The read is performed synchronously, the pager status is mapped to
 *	an errno value (OK -> 0, ERROR -> EIO, FAIL -> EINVAL), and "iodone"
 *	is then called with that value before the pager status is returned.
 */
static int
swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
{
	int r, error;

	r = swap_pager_getpages(object, ma, count, rbehind, rahead);
	switch (r) {
	case VM_PAGER_OK:
		error = 0;
		break;
	case VM_PAGER_ERROR:
		error = EIO;
		break;
	case VM_PAGER_FAIL:
		error = EINVAL;
		break;
	default:
		panic("unhandled swap_pager_getpages() error %d", r);
	}
	(iodone)(arg, ma, count, error);

	return (r);
}

/*
 *	swap_pager_putpages:
 *
 *	Assign swap (if necessary) and initiate I/O on the specified pages.
 *
 *	In a low memory situation we may block in VOP_STRATEGY(), but the new
 *	vm_page reservation system coupled with properly written VFS devices
 *	should ensure that no low-memory deadlock occurs.  This is an area
 *	which needs work.
 *
 *	The parent has N vm_object_pip_add() references prior to
 *	calling us and will remove references for rtvals[] that are
 *	not set to VM_PAGER_PEND.
We need to remove the rest on I/O 1466 * completion. 1467 * 1468 * The parent has soft-busy'd the pages it passes us and will unbusy 1469 * those whose rtvals[] entry is not set to VM_PAGER_PEND on return. 1470 * We need to unbusy the rest on I/O completion. 1471 */ 1472 static void 1473 swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, 1474 int flags, int *rtvals) 1475 { 1476 struct buf *bp; 1477 daddr_t addr, blk, n_free, s_free; 1478 vm_page_t mreq; 1479 int i, j, n; 1480 bool async; 1481 1482 KASSERT(count == 0 || ma[0]->object == object, 1483 ("%s: object mismatch %p/%p", 1484 __func__, object, ma[0]->object)); 1485 1486 VM_OBJECT_WUNLOCK(object); 1487 async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0; 1488 swp_pager_init_freerange(&s_free, &n_free); 1489 1490 /* 1491 * Assign swap blocks and issue I/O. We reallocate swap on the fly. 1492 * The page is left dirty until the pageout operation completes 1493 * successfully. 1494 */ 1495 for (i = 0; i < count; i += n) { 1496 /* Maximum I/O size is limited by maximum swap block size. */ 1497 n = min(count - i, nsw_cluster_max); 1498 1499 if (async) { 1500 mtx_lock(&swbuf_mtx); 1501 while (nsw_wcount_async == 0) 1502 msleep(&nsw_wcount_async, &swbuf_mtx, PVM, 1503 "swbufa", 0); 1504 nsw_wcount_async--; 1505 mtx_unlock(&swbuf_mtx); 1506 } 1507 1508 /* Get a block of swap of size up to size n. 
*/ 1509 blk = swp_pager_getswapspace(&n); 1510 if (blk == SWAPBLK_NONE) { 1511 mtx_lock(&swbuf_mtx); 1512 if (++nsw_wcount_async == 1) 1513 wakeup(&nsw_wcount_async); 1514 mtx_unlock(&swbuf_mtx); 1515 for (j = 0; j < n; ++j) 1516 rtvals[i + j] = VM_PAGER_FAIL; 1517 continue; 1518 } 1519 VM_OBJECT_WLOCK(object); 1520 for (j = 0; j < n; ++j) { 1521 mreq = ma[i + j]; 1522 vm_page_aflag_clear(mreq, PGA_SWAP_FREE); 1523 addr = swp_pager_meta_build(mreq->object, mreq->pindex, 1524 blk + j); 1525 if (addr != SWAPBLK_NONE) 1526 swp_pager_update_freerange(&s_free, &n_free, 1527 addr); 1528 MPASS(mreq->dirty == VM_PAGE_BITS_ALL); 1529 mreq->oflags |= VPO_SWAPINPROG; 1530 } 1531 VM_OBJECT_WUNLOCK(object); 1532 1533 bp = uma_zalloc(swwbuf_zone, M_WAITOK); 1534 MPASS((bp->b_flags & B_MAXPHYS) != 0); 1535 if (async) 1536 bp->b_flags |= B_ASYNC; 1537 bp->b_flags |= B_PAGING; 1538 bp->b_iocmd = BIO_WRITE; 1539 1540 bp->b_rcred = crhold(thread0.td_ucred); 1541 bp->b_wcred = crhold(thread0.td_ucred); 1542 bp->b_bcount = PAGE_SIZE * n; 1543 bp->b_bufsize = PAGE_SIZE * n; 1544 bp->b_blkno = blk; 1545 for (j = 0; j < n; j++) 1546 bp->b_pages[j] = ma[i + j]; 1547 bp->b_npages = n; 1548 1549 /* 1550 * Must set dirty range for NFS to work. 1551 */ 1552 bp->b_dirtyoff = 0; 1553 bp->b_dirtyend = bp->b_bcount; 1554 1555 VM_CNT_INC(v_swapout); 1556 VM_CNT_ADD(v_swappgsout, bp->b_npages); 1557 1558 /* 1559 * We unconditionally set rtvals[] to VM_PAGER_PEND so that we 1560 * can call the async completion routine at the end of a 1561 * synchronous I/O operation. Otherwise, our caller would 1562 * perform duplicate unbusy and wakeup operations on the page 1563 * and object, respectively. 1564 */ 1565 for (j = 0; j < n; j++) 1566 rtvals[i + j] = VM_PAGER_PEND; 1567 1568 /* 1569 * asynchronous 1570 * 1571 * NOTE: b_blkno is destroyed by the call to swapdev_strategy. 
1572 */ 1573 if (async) { 1574 bp->b_iodone = swp_pager_async_iodone; 1575 BUF_KERNPROC(bp); 1576 swp_pager_strategy(bp); 1577 continue; 1578 } 1579 1580 /* 1581 * synchronous 1582 * 1583 * NOTE: b_blkno is destroyed by the call to swapdev_strategy. 1584 */ 1585 bp->b_iodone = bdone; 1586 swp_pager_strategy(bp); 1587 1588 /* 1589 * Wait for the sync I/O to complete. 1590 */ 1591 bwait(bp, PVM, "swwrt"); 1592 1593 /* 1594 * Now that we are through with the bp, we can call the 1595 * normal async completion, which frees everything up. 1596 */ 1597 swp_pager_async_iodone(bp); 1598 } 1599 swp_pager_freeswapspace(s_free, n_free); 1600 VM_OBJECT_WLOCK(object); 1601 } 1602 1603 /* 1604 * swp_pager_async_iodone: 1605 * 1606 * Completion routine for asynchronous reads and writes from/to swap. 1607 * Also called manually by synchronous code to finish up a bp. 1608 * 1609 * This routine may not sleep. 1610 */ 1611 static void 1612 swp_pager_async_iodone(struct buf *bp) 1613 { 1614 int i; 1615 vm_object_t object = NULL; 1616 1617 /* 1618 * Report error - unless we ran out of memory, in which case 1619 * we've already logged it in swapgeom_strategy(). 1620 */ 1621 if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) { 1622 printf( 1623 "swap_pager: I/O error - %s failed; blkno %ld," 1624 "size %ld, error %d\n", 1625 ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), 1626 (long)bp->b_blkno, 1627 (long)bp->b_bcount, 1628 bp->b_error 1629 ); 1630 } 1631 1632 /* 1633 * remove the mapping for kernel virtual 1634 */ 1635 if (buf_mapped(bp)) 1636 pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); 1637 else 1638 bp->b_data = bp->b_kvabase; 1639 1640 if (bp->b_npages) { 1641 object = bp->b_pages[0]->object; 1642 VM_OBJECT_WLOCK(object); 1643 } 1644 1645 /* 1646 * cleanup pages. If an error occurs writing to swap, we are in 1647 * very serious trouble. If it happens to be a disk error, though, 1648 * we may be able to recover by reassigning the swap later on. 
So 1649 * in this case we remove the m->swapblk assignment for the page 1650 * but do not free it in the rlist. The errornous block(s) are thus 1651 * never reallocated as swap. Redirty the page and continue. 1652 */ 1653 for (i = 0; i < bp->b_npages; ++i) { 1654 vm_page_t m = bp->b_pages[i]; 1655 1656 m->oflags &= ~VPO_SWAPINPROG; 1657 if (m->oflags & VPO_SWAPSLEEP) { 1658 m->oflags &= ~VPO_SWAPSLEEP; 1659 wakeup(&object->handle); 1660 } 1661 1662 /* We always have space after I/O, successful or not. */ 1663 vm_page_aflag_set(m, PGA_SWAP_SPACE); 1664 1665 if (bp->b_ioflags & BIO_ERROR) { 1666 /* 1667 * If an error occurs I'd love to throw the swapblk 1668 * away without freeing it back to swapspace, so it 1669 * can never be used again. But I can't from an 1670 * interrupt. 1671 */ 1672 if (bp->b_iocmd == BIO_READ) { 1673 /* 1674 * NOTE: for reads, m->dirty will probably 1675 * be overridden by the original caller of 1676 * getpages so don't play cute tricks here. 1677 */ 1678 vm_page_invalid(m); 1679 } else { 1680 /* 1681 * If a write error occurs, reactivate page 1682 * so it doesn't clog the inactive list, 1683 * then finish the I/O. 1684 */ 1685 MPASS(m->dirty == VM_PAGE_BITS_ALL); 1686 1687 /* PQ_UNSWAPPABLE? */ 1688 vm_page_activate(m); 1689 vm_page_sunbusy(m); 1690 } 1691 } else if (bp->b_iocmd == BIO_READ) { 1692 /* 1693 * NOTE: for reads, m->dirty will probably be 1694 * overridden by the original caller of getpages so 1695 * we cannot set them in order to free the underlying 1696 * swap in a low-swap situation. I don't think we'd 1697 * want to do that anyway, but it was an optimization 1698 * that existed in the old swapper for a time before 1699 * it got ripped out due to precisely this problem. 
1700 */ 1701 KASSERT(!pmap_page_is_mapped(m), 1702 ("swp_pager_async_iodone: page %p is mapped", m)); 1703 KASSERT(m->dirty == 0, 1704 ("swp_pager_async_iodone: page %p is dirty", m)); 1705 1706 vm_page_valid(m); 1707 if (i < bp->b_pgbefore || 1708 i >= bp->b_npages - bp->b_pgafter) 1709 vm_page_readahead_finish(m); 1710 } else { 1711 /* 1712 * For write success, clear the dirty 1713 * status, then finish the I/O ( which decrements the 1714 * busy count and possibly wakes waiter's up ). 1715 * A page is only written to swap after a period of 1716 * inactivity. Therefore, we do not expect it to be 1717 * reused. 1718 */ 1719 KASSERT(!pmap_page_is_write_mapped(m), 1720 ("swp_pager_async_iodone: page %p is not write" 1721 " protected", m)); 1722 vm_page_undirty(m); 1723 vm_page_deactivate_noreuse(m); 1724 vm_page_sunbusy(m); 1725 } 1726 } 1727 1728 /* 1729 * adjust pip. NOTE: the original parent may still have its own 1730 * pip refs on the object. 1731 */ 1732 if (object != NULL) { 1733 vm_object_pip_wakeupn(object, bp->b_npages); 1734 VM_OBJECT_WUNLOCK(object); 1735 } 1736 1737 /* 1738 * swapdev_strategy() manually sets b_vp and b_bufobj before calling 1739 * bstrategy(). Set them back to NULL now we're done with it, or we'll 1740 * trigger a KASSERT in relpbuf(). 1741 */ 1742 if (bp->b_vp) { 1743 bp->b_vp = NULL; 1744 bp->b_bufobj = NULL; 1745 } 1746 /* 1747 * release the physical I/O buffer 1748 */ 1749 if (bp->b_flags & B_ASYNC) { 1750 mtx_lock(&swbuf_mtx); 1751 if (++nsw_wcount_async == 1) 1752 wakeup(&nsw_wcount_async); 1753 mtx_unlock(&swbuf_mtx); 1754 } 1755 uma_zfree((bp->b_iocmd == BIO_READ) ? 
swrbuf_zone : swwbuf_zone, bp); 1756 } 1757 1758 int 1759 swap_pager_nswapdev(void) 1760 { 1761 1762 return (nswapdev); 1763 } 1764 1765 static void 1766 swp_pager_force_dirty(vm_page_t m) 1767 { 1768 1769 vm_page_dirty(m); 1770 swap_pager_unswapped(m); 1771 vm_page_launder(m); 1772 } 1773 1774 u_long 1775 swap_pager_swapped_pages(vm_object_t object) 1776 { 1777 struct swblk *sb; 1778 vm_pindex_t pi; 1779 u_long res; 1780 int i; 1781 1782 VM_OBJECT_ASSERT_LOCKED(object); 1783 1784 if (pctrie_is_empty(&object->un_pager.swp.swp_blks)) 1785 return (0); 1786 1787 for (res = 0, pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( 1788 &object->un_pager.swp.swp_blks, pi)) != NULL; 1789 pi = sb->p + SWAP_META_PAGES) { 1790 for (i = 0; i < SWAP_META_PAGES; i++) { 1791 if (sb->d[i] != SWAPBLK_NONE) 1792 res++; 1793 } 1794 } 1795 return (res); 1796 } 1797 1798 /* 1799 * swap_pager_swapoff_object: 1800 * 1801 * Page in all of the pages that have been paged out for an object 1802 * to a swap device. 1803 */ 1804 static void 1805 swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object) 1806 { 1807 struct swblk *sb; 1808 vm_page_t m; 1809 vm_pindex_t pi; 1810 daddr_t blk; 1811 int i, nv, rahead, rv; 1812 1813 KASSERT((object->flags & OBJ_SWAP) != 0, 1814 ("%s: Object not swappable", __func__)); 1815 1816 for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( 1817 &object->un_pager.swp.swp_blks, pi)) != NULL; ) { 1818 if ((object->flags & OBJ_DEAD) != 0) { 1819 /* 1820 * Make sure that pending writes finish before 1821 * returning. 1822 */ 1823 vm_object_pip_wait(object, "swpoff"); 1824 swp_pager_meta_free_all(object); 1825 break; 1826 } 1827 for (i = 0; i < SWAP_META_PAGES; i++) { 1828 /* 1829 * Count the number of contiguous valid blocks. 
1830 */ 1831 for (nv = 0; nv < SWAP_META_PAGES - i; nv++) { 1832 blk = sb->d[i + nv]; 1833 if (!swp_pager_isondev(blk, sp) || 1834 blk == SWAPBLK_NONE) 1835 break; 1836 } 1837 if (nv == 0) 1838 continue; 1839 1840 /* 1841 * Look for a page corresponding to the first 1842 * valid block and ensure that any pending paging 1843 * operations on it are complete. If the page is valid, 1844 * mark it dirty and free the swap block. Try to batch 1845 * this operation since it may cause sp to be freed, 1846 * meaning that we must restart the scan. Avoid busying 1847 * valid pages since we may block forever on kernel 1848 * stack pages. 1849 */ 1850 m = vm_page_lookup(object, sb->p + i); 1851 if (m == NULL) { 1852 m = vm_page_alloc(object, sb->p + i, 1853 VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); 1854 if (m == NULL) 1855 break; 1856 } else { 1857 if ((m->oflags & VPO_SWAPINPROG) != 0) { 1858 m->oflags |= VPO_SWAPSLEEP; 1859 VM_OBJECT_SLEEP(object, &object->handle, 1860 PSWP, "swpoff", 0); 1861 break; 1862 } 1863 if (vm_page_all_valid(m)) { 1864 do { 1865 swp_pager_force_dirty(m); 1866 } while (--nv > 0 && 1867 (m = vm_page_next(m)) != NULL && 1868 vm_page_all_valid(m) && 1869 (m->oflags & VPO_SWAPINPROG) == 0); 1870 break; 1871 } 1872 if (!vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL)) 1873 break; 1874 } 1875 1876 vm_object_pip_add(object, 1); 1877 rahead = SWAP_META_PAGES; 1878 rv = swap_pager_getpages_locked(object, &m, 1, NULL, 1879 &rahead); 1880 if (rv != VM_PAGER_OK) 1881 panic("%s: read from swap failed: %d", 1882 __func__, rv); 1883 vm_object_pip_wakeupn(object, 1); 1884 VM_OBJECT_WLOCK(object); 1885 vm_page_xunbusy(m); 1886 1887 /* 1888 * The object lock was dropped so we must restart the 1889 * scan of this swap block. Pages paged in during this 1890 * iteration will be marked dirty in a future iteration. 
1891 */ 1892 break; 1893 } 1894 if (i == SWAP_META_PAGES) 1895 pi = sb->p + SWAP_META_PAGES; 1896 } 1897 } 1898 1899 /* 1900 * swap_pager_swapoff: 1901 * 1902 * Page in all of the pages that have been paged out to the 1903 * given device. The corresponding blocks in the bitmap must be 1904 * marked as allocated and the device must be flagged SW_CLOSING. 1905 * There may be no processes swapped out to the device. 1906 * 1907 * This routine may block. 1908 */ 1909 static void 1910 swap_pager_swapoff(struct swdevt *sp) 1911 { 1912 vm_object_t object; 1913 int retries; 1914 1915 sx_assert(&swdev_syscall_lock, SA_XLOCKED); 1916 1917 retries = 0; 1918 full_rescan: 1919 mtx_lock(&vm_object_list_mtx); 1920 TAILQ_FOREACH(object, &vm_object_list, object_list) { 1921 if ((object->flags & OBJ_SWAP) == 0) 1922 continue; 1923 mtx_unlock(&vm_object_list_mtx); 1924 /* Depends on type-stability. */ 1925 VM_OBJECT_WLOCK(object); 1926 1927 /* 1928 * Dead objects are eventually terminated on their own. 1929 */ 1930 if ((object->flags & OBJ_DEAD) != 0) 1931 goto next_obj; 1932 1933 /* 1934 * Sync with fences placed after pctrie 1935 * initialization. We must not access pctrie below 1936 * unless we checked that our object is swap and not 1937 * dead. 1938 */ 1939 atomic_thread_fence_acq(); 1940 if ((object->flags & OBJ_SWAP) == 0) 1941 goto next_obj; 1942 1943 swap_pager_swapoff_object(sp, object); 1944 next_obj: 1945 VM_OBJECT_WUNLOCK(object); 1946 mtx_lock(&vm_object_list_mtx); 1947 } 1948 mtx_unlock(&vm_object_list_mtx); 1949 1950 if (sp->sw_used) { 1951 /* 1952 * Objects may be locked or paging to the device being 1953 * removed, so we will miss their pages and need to 1954 * make another pass. We have marked this device as 1955 * SW_CLOSING, so the activity should finish soon. 
1956 */ 1957 retries++; 1958 if (retries > 100) { 1959 panic("swapoff: failed to locate %d swap blocks", 1960 sp->sw_used); 1961 } 1962 pause("swpoff", hz / 20); 1963 goto full_rescan; 1964 } 1965 EVENTHANDLER_INVOKE(swapoff, sp); 1966 } 1967 1968 /************************************************************************ 1969 * SWAP META DATA * 1970 ************************************************************************ 1971 * 1972 * These routines manipulate the swap metadata stored in the 1973 * OBJT_SWAP object. 1974 * 1975 * Swap metadata is implemented with a global hash and not directly 1976 * linked into the object. Instead the object simply contains 1977 * appropriate tracking counters. 1978 */ 1979 1980 /* 1981 * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free? 1982 */ 1983 static bool 1984 swp_pager_swblk_empty(struct swblk *sb, int start, int limit) 1985 { 1986 int i; 1987 1988 MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES); 1989 for (i = start; i < limit; i++) { 1990 if (sb->d[i] != SWAPBLK_NONE) 1991 return (false); 1992 } 1993 return (true); 1994 } 1995 1996 /* 1997 * SWP_PAGER_FREE_EMPTY_SWBLK() - frees if a block is free 1998 * 1999 * Nothing is done if the block is still in use. 2000 */ 2001 static void 2002 swp_pager_free_empty_swblk(vm_object_t object, struct swblk *sb) 2003 { 2004 2005 if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) { 2006 SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); 2007 uma_zfree(swblk_zone, sb); 2008 } 2009 } 2010 2011 /* 2012 * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object 2013 * 2014 * The specified swapblk is added to the object's swap metadata. If 2015 * the swapblk is not valid, it is freed instead. Any previously 2016 * assigned swapblk is returned. 
*/
static daddr_t
swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
{
	/* Remember whether "zone exhausted" was logged, to log only once. */
	static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
	struct swblk *sb, *sb1;
	vm_pindex_t modpi, rdpi;
	daddr_t prev_swapblk;
	int error, i;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/* rdpi is the index of the first page covered by the swblk. */
	rdpi = rounddown(pindex, SWAP_META_PAGES);
	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi);
	if (sb == NULL) {
		/* Clearing a slot that has no swblk: nothing to do. */
		if (swapblk == SWAPBLK_NONE)
			return (SWAPBLK_NONE);

		/*
		 * Allocate a swblk without sleeping.  On failure the object
		 * lock is dropped while waiting for memory, so after
		 * relocking the trie is checked again for a swblk that
		 * appeared in the meantime.
		 */
		for (;;) {
			sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
			    pageproc ? M_USE_RESERVE : 0));
			if (sb != NULL) {
				sb->p = rdpi;
				for (i = 0; i < SWAP_META_PAGES; i++)
					sb->d[i] = SWAPBLK_NONE;
				if (atomic_cmpset_int(&swblk_zone_exhausted,
				    1, 0))
					printf("swblk zone ok\n");
				break;
			}
			VM_OBJECT_WUNLOCK(object);
			if (uma_zone_exhausted(swblk_zone)) {
				if (atomic_cmpset_int(&swblk_zone_exhausted,
				    0, 1))
					printf("swap blk zone exhausted, "
					    "increase kern.maxswzone\n");
				vm_pageout_oom(VM_OOM_SWAPZ);
				pause("swzonxb", 10);
			} else
				uma_zwait(swblk_zone);
			VM_OBJECT_WLOCK(object);
			sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
			    rdpi);
			if (sb != NULL)
				/*
				 * Somebody swapped out a nearby page,
				 * allocating swblk at the rdpi index,
				 * while we dropped the object lock.
				 */
				goto allocated;
		}

		/*
		 * Insert the new swblk into the trie, retrying in the same
		 * way when the insert fails.  If another swblk shows up at
		 * rdpi while the lock is dropped, ours is freed and the
		 * existing one is used.
		 */
		for (;;) {
			error = SWAP_PCTRIE_INSERT(
			    &object->un_pager.swp.swp_blks, sb);
			if (error == 0) {
				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
				    1, 0))
					printf("swpctrie zone ok\n");
				break;
			}
			VM_OBJECT_WUNLOCK(object);
			if (uma_zone_exhausted(swpctrie_zone)) {
				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
				    0, 1))
					printf("swap pctrie zone exhausted, "
					    "increase kern.maxswzone\n");
				vm_pageout_oom(VM_OOM_SWAPZ);
				pause("swzonxp", 10);
			} else
				uma_zwait(swpctrie_zone);
			VM_OBJECT_WLOCK(object);
			sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
			    rdpi);
			if (sb1 != NULL) {
				uma_zfree(swblk_zone, sb);
				sb = sb1;
				goto allocated;
			}
		}
	}
allocated:
	MPASS(sb->p == rdpi);

	modpi = pindex % SWAP_META_PAGES;
	/* Return prior contents of metadata. */
	prev_swapblk = sb->d[modpi];
	/* Enter block into metadata. */
	sb->d[modpi] = swapblk;

	/*
	 * Free the swblk if we end up with the empty page run.
	 */
	if (swapblk == SWAPBLK_NONE)
		swp_pager_free_empty_swblk(object, sb);
	return (prev_swapblk);
}

/*
 * SWP_PAGER_META_TRANSFER() - free a range of blocks in the srcobject's swap
 * metadata, or transfer it into dstobject.
 *
 *	This routine will free swap metadata structures as they are cleaned
 *	out.
*/
static void
swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
    vm_pindex_t pindex, vm_pindex_t count)
{
	struct swblk *sb;
	daddr_t n_free, s_free;
	vm_pindex_t offset, last;
	int i, limit, start;

	VM_OBJECT_ASSERT_WLOCKED(srcobject);
	if (count == 0 || pctrie_is_empty(&srcobject->un_pager.swp.swp_blks))
		return;

	swp_pager_init_freerange(&s_free, &n_free);
	/* offset keeps the original start; pindex advances per swblk. */
	offset = pindex;
	last = pindex + count;
	for (;;) {
		sb = SWAP_PCTRIE_LOOKUP_GE(&srcobject->un_pager.swp.swp_blks,
		    rounddown(pindex, SWAP_META_PAGES));
		if (sb == NULL || sb->p >= last)
			break;
		/* Clip [start, limit) to the part of this swblk in range. */
		start = pindex > sb->p ? pindex - sb->p : 0;
		limit = last - sb->p < SWAP_META_PAGES ? last - sb->p :
		    SWAP_META_PAGES;
		for (i = start; i < limit; i++) {
			if (sb->d[i] == SWAPBLK_NONE)
				continue;
			/*
			 * With no destination, or when the destination
			 * declines the block, queue it for release.
			 * NOTE(review): swp_pager_xfer_source() drops the
			 * source object lock while sb is still in use here
			 * -- confirm sb cannot be freed meanwhile.
			 */
			if (dstobject == NULL ||
			    !swp_pager_xfer_source(srcobject, dstobject,
			    sb->p + i - offset, sb->d[i])) {
				swp_pager_update_freerange(&s_free, &n_free,
				    sb->d[i]);
			}
			sb->d[i] = SWAPBLK_NONE;
		}
		pindex = sb->p + SWAP_META_PAGES;
		/* Slots in [start, limit) are clear; check only the rest. */
		if (swp_pager_swblk_empty(sb, 0, start) &&
		    swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
			SWAP_PCTRIE_REMOVE(&srcobject->un_pager.swp.swp_blks,
			    sb->p);
			uma_zfree(swblk_zone, sb);
		}
	}
	swp_pager_freeswapspace(s_free, n_free);
}

/*
 * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
 *
 *	The requested range of blocks is freed, with any associated swap
 *	returned to the swap bitmap.
 *
 *	This routine will free swap metadata structures as they are cleaned
 *	out.  This routine does *NOT* operate on swap metadata associated
 *	with resident pages.
2175 */ 2176 static void 2177 swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count) 2178 { 2179 swp_pager_meta_transfer(object, NULL, pindex, count); 2180 } 2181 2182 /* 2183 * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object 2184 * 2185 * This routine locates and destroys all swap metadata associated with 2186 * an object. 2187 */ 2188 static void 2189 swp_pager_meta_free_all(vm_object_t object) 2190 { 2191 struct swblk *sb; 2192 daddr_t n_free, s_free; 2193 vm_pindex_t pindex; 2194 int i; 2195 2196 VM_OBJECT_ASSERT_WLOCKED(object); 2197 2198 if (pctrie_is_empty(&object->un_pager.swp.swp_blks)) 2199 return; 2200 2201 swp_pager_init_freerange(&s_free, &n_free); 2202 for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( 2203 &object->un_pager.swp.swp_blks, pindex)) != NULL;) { 2204 pindex = sb->p + SWAP_META_PAGES; 2205 for (i = 0; i < SWAP_META_PAGES; i++) { 2206 if (sb->d[i] == SWAPBLK_NONE) 2207 continue; 2208 swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); 2209 } 2210 SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); 2211 uma_zfree(swblk_zone, sb); 2212 } 2213 swp_pager_freeswapspace(s_free, n_free); 2214 } 2215 2216 /* 2217 * SWP_PAGER_METACTL() - misc control of swap meta data. 2218 * 2219 * This routine is capable of looking up, or removing swapblk 2220 * assignments in the swap meta data. It returns the swapblk being 2221 * looked-up, popped, or SWAPBLK_NONE if the block was invalid. 2222 * 2223 * When acting on a busy resident page and paging is in progress, we 2224 * have to wait until paging is complete but otherwise can act on the 2225 * busy page. 2226 */ 2227 static daddr_t 2228 swp_pager_meta_lookup(vm_object_t object, vm_pindex_t pindex) 2229 { 2230 struct swblk *sb; 2231 2232 VM_OBJECT_ASSERT_LOCKED(object); 2233 2234 /* 2235 * The meta data only exists if the object is OBJT_SWAP 2236 * and even then might not be allocated yet. 
2237 */ 2238 KASSERT((object->flags & OBJ_SWAP) != 0, 2239 ("Lookup object not swappable")); 2240 2241 sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, 2242 rounddown(pindex, SWAP_META_PAGES)); 2243 if (sb == NULL) 2244 return (SWAPBLK_NONE); 2245 return (sb->d[pindex % SWAP_META_PAGES]); 2246 } 2247 2248 /* 2249 * Returns the least page index which is greater than or equal to the 2250 * parameter pindex and for which there is a swap block allocated. 2251 * Returns object's size if the object's type is not swap or if there 2252 * are no allocated swap blocks for the object after the requested 2253 * pindex. 2254 */ 2255 vm_pindex_t 2256 swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) 2257 { 2258 struct swblk *sb; 2259 int i; 2260 2261 VM_OBJECT_ASSERT_LOCKED(object); 2262 2263 if (pctrie_is_empty(&object->un_pager.swp.swp_blks)) 2264 return (object->size); 2265 sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, 2266 rounddown(pindex, SWAP_META_PAGES)); 2267 if (sb == NULL) 2268 return (object->size); 2269 if (sb->p < pindex) { 2270 for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) { 2271 if (sb->d[i] != SWAPBLK_NONE) 2272 return (sb->p + i); 2273 } 2274 sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, 2275 roundup(pindex, SWAP_META_PAGES)); 2276 if (sb == NULL) 2277 return (object->size); 2278 } 2279 for (i = 0; i < SWAP_META_PAGES; i++) { 2280 if (sb->d[i] != SWAPBLK_NONE) 2281 return (sb->p + i); 2282 } 2283 2284 /* 2285 * We get here if a swblk is present in the trie but it 2286 * doesn't map any blocks. 2287 */ 2288 MPASS(0); 2289 return (object->size); 2290 } 2291 2292 /* 2293 * System call swapon(name) enables swapping on device name, 2294 * which must be in the swdevsw. Return EBUSY 2295 * if already swapping on this device. 
2296 */ 2297 #ifndef _SYS_SYSPROTO_H_ 2298 struct swapon_args { 2299 char *name; 2300 }; 2301 #endif 2302 2303 int 2304 sys_swapon(struct thread *td, struct swapon_args *uap) 2305 { 2306 struct vattr attr; 2307 struct vnode *vp; 2308 struct nameidata nd; 2309 int error; 2310 2311 error = priv_check(td, PRIV_SWAPON); 2312 if (error) 2313 return (error); 2314 2315 sx_xlock(&swdev_syscall_lock); 2316 2317 /* 2318 * Swap metadata may not fit in the KVM if we have physical 2319 * memory of >1GB. 2320 */ 2321 if (swblk_zone == NULL) { 2322 error = ENOMEM; 2323 goto done; 2324 } 2325 2326 NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, 2327 UIO_USERSPACE, uap->name); 2328 error = namei(&nd); 2329 if (error) 2330 goto done; 2331 2332 NDFREE_PNBUF(&nd); 2333 vp = nd.ni_vp; 2334 2335 if (vn_isdisk_error(vp, &error)) { 2336 error = swapongeom(vp); 2337 } else if (vp->v_type == VREG && 2338 (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && 2339 (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { 2340 /* 2341 * Allow direct swapping to NFS regular files in the same 2342 * way that nfs_mountroot() sets up diskless swapping. 2343 */ 2344 error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); 2345 } 2346 2347 if (error != 0) 2348 vput(vp); 2349 else 2350 VOP_UNLOCK(vp); 2351 done: 2352 sx_xunlock(&swdev_syscall_lock); 2353 return (error); 2354 } 2355 2356 /* 2357 * Check that the total amount of swap currently configured does not 2358 * exceed half the theoretical maximum. If it does, print a warning 2359 * message. 
2360 */ 2361 static void 2362 swapon_check_swzone(void) 2363 { 2364 2365 /* recommend using no more than half that amount */ 2366 if (swap_total > swap_maxpages / 2) { 2367 printf("warning: total configured swap (%lu pages) " 2368 "exceeds maximum recommended amount (%lu pages).\n", 2369 swap_total, swap_maxpages / 2); 2370 printf("warning: increase kern.maxswzone " 2371 "or reduce amount of swap.\n"); 2372 } 2373 } 2374 2375 static void 2376 swaponsomething(struct vnode *vp, void *id, u_long nblks, 2377 sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) 2378 { 2379 struct swdevt *sp, *tsp; 2380 daddr_t dvbase; 2381 2382 /* 2383 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. 2384 * First chop nblks off to page-align it, then convert. 2385 * 2386 * sw->sw_nblks is in page-sized chunks now too. 2387 */ 2388 nblks &= ~(ctodb(1) - 1); 2389 nblks = dbtoc(nblks); 2390 2391 sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); 2392 sp->sw_blist = blist_create(nblks, M_WAITOK); 2393 sp->sw_vp = vp; 2394 sp->sw_id = id; 2395 sp->sw_dev = dev; 2396 sp->sw_nblks = nblks; 2397 sp->sw_used = 0; 2398 sp->sw_strategy = strategy; 2399 sp->sw_close = close; 2400 sp->sw_flags = flags; 2401 2402 /* 2403 * Do not free the first blocks in order to avoid overwriting 2404 * any bsd label at the front of the partition 2405 */ 2406 blist_free(sp->sw_blist, howmany(BBSIZE, PAGE_SIZE), 2407 nblks - howmany(BBSIZE, PAGE_SIZE)); 2408 2409 dvbase = 0; 2410 mtx_lock(&sw_dev_mtx); 2411 TAILQ_FOREACH(tsp, &swtailq, sw_list) { 2412 if (tsp->sw_end >= dvbase) { 2413 /* 2414 * We put one uncovered page between the devices 2415 * in order to definitively prevent any cross-device 2416 * I/O requests 2417 */ 2418 dvbase = tsp->sw_end + 1; 2419 } 2420 } 2421 sp->sw_first = dvbase; 2422 sp->sw_end = dvbase + nblks; 2423 TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); 2424 nswapdev++; 2425 swap_pager_avail += nblks - howmany(BBSIZE, PAGE_SIZE); 2426 swap_total += nblks; 2427 
swapon_check_swzone(); 2428 swp_sizecheck(); 2429 mtx_unlock(&sw_dev_mtx); 2430 EVENTHANDLER_INVOKE(swapon, sp); 2431 } 2432 2433 /* 2434 * SYSCALL: swapoff(devname) 2435 * 2436 * Disable swapping on the given device. 2437 * 2438 * XXX: Badly designed system call: it should use a device index 2439 * rather than filename as specification. We keep sw_vp around 2440 * only to make this work. 2441 */ 2442 static int 2443 kern_swapoff(struct thread *td, const char *name, enum uio_seg name_seg, 2444 u_int flags) 2445 { 2446 struct vnode *vp; 2447 struct nameidata nd; 2448 struct swdevt *sp; 2449 int error; 2450 2451 error = priv_check(td, PRIV_SWAPOFF); 2452 if (error != 0) 2453 return (error); 2454 if ((flags & ~(SWAPOFF_FORCE)) != 0) 2455 return (EINVAL); 2456 2457 sx_xlock(&swdev_syscall_lock); 2458 2459 NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, name_seg, name); 2460 error = namei(&nd); 2461 if (error) 2462 goto done; 2463 NDFREE_PNBUF(&nd); 2464 vp = nd.ni_vp; 2465 2466 mtx_lock(&sw_dev_mtx); 2467 TAILQ_FOREACH(sp, &swtailq, sw_list) { 2468 if (sp->sw_vp == vp) 2469 break; 2470 } 2471 mtx_unlock(&sw_dev_mtx); 2472 if (sp == NULL) { 2473 error = EINVAL; 2474 goto done; 2475 } 2476 error = swapoff_one(sp, td->td_ucred, flags); 2477 done: 2478 sx_xunlock(&swdev_syscall_lock); 2479 return (error); 2480 } 2481 2482 2483 #ifdef COMPAT_FREEBSD13 2484 int 2485 freebsd13_swapoff(struct thread *td, struct freebsd13_swapoff_args *uap) 2486 { 2487 return (kern_swapoff(td, uap->name, UIO_USERSPACE, 0)); 2488 } 2489 #endif 2490 2491 int 2492 sys_swapoff(struct thread *td, struct swapoff_args *uap) 2493 { 2494 return (kern_swapoff(td, uap->name, UIO_USERSPACE, uap->flags)); 2495 } 2496 2497 static int 2498 swapoff_one(struct swdevt *sp, struct ucred *cred, u_int flags) 2499 { 2500 u_long nblks; 2501 #ifdef MAC 2502 int error; 2503 #endif 2504 2505 sx_assert(&swdev_syscall_lock, SA_XLOCKED); 2506 #ifdef MAC 2507 (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); 2508 error = 
mac_system_check_swapoff(cred, sp->sw_vp); 2509 (void) VOP_UNLOCK(sp->sw_vp); 2510 if (error != 0) 2511 return (error); 2512 #endif 2513 nblks = sp->sw_nblks; 2514 2515 /* 2516 * We can turn off this swap device safely only if the 2517 * available virtual memory in the system will fit the amount 2518 * of data we will have to page back in, plus an epsilon so 2519 * the system doesn't become critically low on swap space. 2520 * The vm_free_count() part does not account e.g. for clean 2521 * pages that can be immediately reclaimed without paging, so 2522 * this is a very rough estimation. 2523 * 2524 * On the other hand, not turning swap off on swapoff_all() 2525 * means that we can lose swap data when filesystems go away, 2526 * which is arguably worse. 2527 */ 2528 if ((flags & SWAPOFF_FORCE) == 0 && 2529 vm_free_count() + swap_pager_avail < nblks + nswap_lowat) 2530 return (ENOMEM); 2531 2532 /* 2533 * Prevent further allocations on this device. 2534 */ 2535 mtx_lock(&sw_dev_mtx); 2536 sp->sw_flags |= SW_CLOSING; 2537 swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks); 2538 swap_total -= nblks; 2539 mtx_unlock(&sw_dev_mtx); 2540 2541 /* 2542 * Page in the contents of the device and close it. 
2543 */ 2544 swap_pager_swapoff(sp); 2545 2546 sp->sw_close(curthread, sp); 2547 mtx_lock(&sw_dev_mtx); 2548 sp->sw_id = NULL; 2549 TAILQ_REMOVE(&swtailq, sp, sw_list); 2550 nswapdev--; 2551 if (nswapdev == 0) { 2552 swap_pager_full = 2; 2553 swap_pager_almost_full = 1; 2554 } 2555 if (swdevhd == sp) 2556 swdevhd = NULL; 2557 mtx_unlock(&sw_dev_mtx); 2558 blist_destroy(sp->sw_blist); 2559 free(sp, M_VMPGDATA); 2560 return (0); 2561 } 2562 2563 void 2564 swapoff_all(void) 2565 { 2566 struct swdevt *sp, *spt; 2567 const char *devname; 2568 int error; 2569 2570 sx_xlock(&swdev_syscall_lock); 2571 2572 mtx_lock(&sw_dev_mtx); 2573 TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { 2574 mtx_unlock(&sw_dev_mtx); 2575 if (vn_isdisk(sp->sw_vp)) 2576 devname = devtoname(sp->sw_vp->v_rdev); 2577 else 2578 devname = "[file]"; 2579 error = swapoff_one(sp, thread0.td_ucred, SWAPOFF_FORCE); 2580 if (error != 0) { 2581 printf("Cannot remove swap device %s (error=%d), " 2582 "skipping.\n", devname, error); 2583 } else if (bootverbose) { 2584 printf("Swap device %s removed.\n", devname); 2585 } 2586 mtx_lock(&sw_dev_mtx); 2587 } 2588 mtx_unlock(&sw_dev_mtx); 2589 2590 sx_xunlock(&swdev_syscall_lock); 2591 } 2592 2593 void 2594 swap_pager_status(int *total, int *used) 2595 { 2596 2597 *total = swap_total; 2598 *used = swap_total - swap_pager_avail - 2599 nswapdev * howmany(BBSIZE, PAGE_SIZE); 2600 } 2601 2602 int 2603 swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) 2604 { 2605 struct swdevt *sp; 2606 const char *tmp_devname; 2607 int error, n; 2608 2609 n = 0; 2610 error = ENOENT; 2611 mtx_lock(&sw_dev_mtx); 2612 TAILQ_FOREACH(sp, &swtailq, sw_list) { 2613 if (n != name) { 2614 n++; 2615 continue; 2616 } 2617 xs->xsw_version = XSWDEV_VERSION; 2618 xs->xsw_dev = sp->sw_dev; 2619 xs->xsw_flags = sp->sw_flags; 2620 xs->xsw_nblks = sp->sw_nblks; 2621 xs->xsw_used = sp->sw_used; 2622 if (devname != NULL) { 2623 if (vn_isdisk(sp->sw_vp)) 2624 tmp_devname = 
devtoname(sp->sw_vp->v_rdev); 2625 else 2626 tmp_devname = "[file]"; 2627 strncpy(devname, tmp_devname, len); 2628 } 2629 error = 0; 2630 break; 2631 } 2632 mtx_unlock(&sw_dev_mtx); 2633 return (error); 2634 } 2635 2636 #if defined(COMPAT_FREEBSD11) 2637 #define XSWDEV_VERSION_11 1 2638 struct xswdev11 { 2639 u_int xsw_version; 2640 uint32_t xsw_dev; 2641 int xsw_flags; 2642 int xsw_nblks; 2643 int xsw_used; 2644 }; 2645 #endif 2646 2647 #if defined(__amd64__) && defined(COMPAT_FREEBSD32) 2648 struct xswdev32 { 2649 u_int xsw_version; 2650 u_int xsw_dev1, xsw_dev2; 2651 int xsw_flags; 2652 int xsw_nblks; 2653 int xsw_used; 2654 }; 2655 #endif 2656 2657 static int 2658 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) 2659 { 2660 struct xswdev xs; 2661 #if defined(__amd64__) && defined(COMPAT_FREEBSD32) 2662 struct xswdev32 xs32; 2663 #endif 2664 #if defined(COMPAT_FREEBSD11) 2665 struct xswdev11 xs11; 2666 #endif 2667 int error; 2668 2669 if (arg2 != 1) /* name length */ 2670 return (EINVAL); 2671 2672 memset(&xs, 0, sizeof(xs)); 2673 error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); 2674 if (error != 0) 2675 return (error); 2676 #if defined(__amd64__) && defined(COMPAT_FREEBSD32) 2677 if (req->oldlen == sizeof(xs32)) { 2678 memset(&xs32, 0, sizeof(xs32)); 2679 xs32.xsw_version = XSWDEV_VERSION; 2680 xs32.xsw_dev1 = xs.xsw_dev; 2681 xs32.xsw_dev2 = xs.xsw_dev >> 32; 2682 xs32.xsw_flags = xs.xsw_flags; 2683 xs32.xsw_nblks = xs.xsw_nblks; 2684 xs32.xsw_used = xs.xsw_used; 2685 error = SYSCTL_OUT(req, &xs32, sizeof(xs32)); 2686 return (error); 2687 } 2688 #endif 2689 #if defined(COMPAT_FREEBSD11) 2690 if (req->oldlen == sizeof(xs11)) { 2691 memset(&xs11, 0, sizeof(xs11)); 2692 xs11.xsw_version = XSWDEV_VERSION_11; 2693 xs11.xsw_dev = xs.xsw_dev; /* truncation */ 2694 xs11.xsw_flags = xs.xsw_flags; 2695 xs11.xsw_nblks = xs.xsw_nblks; 2696 xs11.xsw_used = xs.xsw_used; 2697 error = SYSCTL_OUT(req, &xs11, sizeof(xs11)); 2698 return (error); 2699 } 2700 #endif 2701 error = 
SYSCTL_OUT(req, &xs, sizeof(xs)); 2702 return (error); 2703 } 2704 2705 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, 2706 "Number of swap devices"); 2707 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, 2708 sysctl_vm_swap_info, 2709 "Swap statistics by device"); 2710 2711 /* 2712 * Count the approximate swap usage in pages for a vmspace. The 2713 * shadowed or not yet copied on write swap blocks are not accounted. 2714 * The map must be locked. 2715 */ 2716 long 2717 vmspace_swap_count(struct vmspace *vmspace) 2718 { 2719 vm_map_t map; 2720 vm_map_entry_t cur; 2721 vm_object_t object; 2722 struct swblk *sb; 2723 vm_pindex_t e, pi; 2724 long count; 2725 int i; 2726 2727 map = &vmspace->vm_map; 2728 count = 0; 2729 2730 VM_MAP_ENTRY_FOREACH(cur, map) { 2731 if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) 2732 continue; 2733 object = cur->object.vm_object; 2734 if (object == NULL || (object->flags & OBJ_SWAP) == 0) 2735 continue; 2736 VM_OBJECT_RLOCK(object); 2737 if ((object->flags & OBJ_SWAP) == 0) 2738 goto unlock; 2739 pi = OFF_TO_IDX(cur->offset); 2740 e = pi + OFF_TO_IDX(cur->end - cur->start); 2741 for (;; pi = sb->p + SWAP_META_PAGES) { 2742 sb = SWAP_PCTRIE_LOOKUP_GE( 2743 &object->un_pager.swp.swp_blks, pi); 2744 if (sb == NULL || sb->p >= e) 2745 break; 2746 for (i = 0; i < SWAP_META_PAGES; i++) { 2747 if (sb->p + i < e && 2748 sb->d[i] != SWAPBLK_NONE) 2749 count++; 2750 } 2751 } 2752 unlock: 2753 VM_OBJECT_RUNLOCK(object); 2754 } 2755 return (count); 2756 } 2757 2758 /* 2759 * GEOM backend 2760 * 2761 * Swapping onto disk devices. 
*
 */

static g_orphan_t swapgeom_orphan;

/* GEOM class under which all swap consumers are created. */
static struct g_class g_swap_class = {
	.name = "SWAP",
	.version = G_VERSION,
	.orphan = swapgeom_orphan,
};

DECLARE_GEOM_CLASS(g_swap_class, g_class);

/*
 * Event handler that tears down a swap consumer: drops the read and
 * write access counts taken in swapongeom_locked(), then detaches and
 * destroys the consumer.  arg is the g_consumer; flags is unused.
 */
static void
swapgeom_close_ev(void *arg, int flags)
{
	struct g_consumer *cp;

	cp = arg;
	g_access(cp, -1, -1, 0);
	g_detach(cp);
	g_destroy_consumer(cp);
}

/*
 * Add a reference to the g_consumer for an inflight transaction.
 * The reference count is kept in cp->index; sw_dev_mtx must be held.
 */
static void
swapgeom_acquire(struct g_consumer *cp)
{

	mtx_assert(&sw_dev_mtx, MA_OWNED);
	cp->index++;
}

/*
 * Remove a reference from the g_consumer.  Post a close event if all
 * references go away, since the function might be called from the
 * biodone context.
 *
 * sw_dev_mtx must be held.  sp->sw_id is cleared only when the close
 * event was posted successfully (g_post_event() returned 0).
 */
static void
swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
{

	mtx_assert(&sw_dev_mtx, MA_OWNED);
	cp->index--;
	if (cp->index == 0) {
		if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
			sp->sw_id = NULL;
	}
}

/*
 * bio completion callback installed by swapgeom_strategy().  Copies the
 * completion status from the bio to the originating buf, completes the
 * buf, drops the consumer reference taken for the I/O and destroys the
 * bio.
 */
static void
swapgeom_done(struct bio *bp2)
{
	struct swdevt *sp;
	struct buf *bp;
	struct g_consumer *cp;

	/* bio_caller2 and bio_caller1 were set by swapgeom_strategy(). */
	bp = bp2->bio_caller2;
	cp = bp2->bio_from;
	bp->b_ioflags = bp2->bio_flags;
	if (bp2->bio_error)
		bp->b_ioflags |= BIO_ERROR;
	bp->b_resid = bp->b_bcount - bp2->bio_completed;
	bp->b_error = bp2->bio_error;
	bp->b_caller1 = NULL;
	bufdone(bp);
	sp = bp2->bio_caller1;
	mtx_lock(&sw_dev_mtx);
	swapgeom_release(cp, sp);
	mtx_unlock(&sw_dev_mtx);
	g_destroy_bio(bp2);
}

/*
 * Strategy routine for GEOM-backed swap devices: turn the buf into a bio
 * and send it to the device's consumer.
 *
 * The buf is completed with ENXIO if the device no longer has a consumer
 * and with ENOMEM if no bio could be allocated.  Otherwise a consumer
 * reference is held for the duration of the I/O and released from
 * swapgeom_done().
 */
static void
swapgeom_strategy(struct buf *bp, struct swdevt *sp)
{
	struct bio *bio;
	struct g_consumer *cp;

	mtx_lock(&sw_dev_mtx);
	cp = sp->sw_id;
	if (cp == NULL) {
		mtx_unlock(&sw_dev_mtx);
		bp->b_error = ENXIO;
		bp->b_ioflags |= BIO_ERROR;
		bufdone(bp);
		return;
	}
	swapgeom_acquire(cp);
	mtx_unlock(&sw_dev_mtx);
	/* Writes and all other commands use different bio allocators. */
	if (bp->b_iocmd == BIO_WRITE)
		bio = g_new_bio();
	else
		bio = g_alloc_bio();
	if (bio == NULL) {
		/* Give back the reference taken above before failing. */
		mtx_lock(&sw_dev_mtx);
		swapgeom_release(cp, sp);
		mtx_unlock(&sw_dev_mtx);
		bp->b_error = ENOMEM;
		bp->b_ioflags |= BIO_ERROR;
		printf("swap_pager: cannot allocate bio\n");
		bufdone(bp);
		return;
	}

	bp->b_caller1 = bio;
	bio->bio_caller1 = sp;
	bio->bio_caller2 = bp;
	bio->bio_cmd = bp->b_iocmd;
	/* b_blkno is a global swap page number; make it device-relative. */
	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
	bio->bio_length = bp->b_bcount;
	bio->bio_done = swapgeom_done;
	bio->bio_flags |= BIO_SWAP;
	if (!buf_mapped(bp)) {
		/* Unmapped buf: pass the page array instead of a KVA. */
		bio->bio_ma = bp->b_pages;
		bio->bio_data = unmapped_buf;
		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
		bio->bio_ma_n = bp->b_npages;
		bio->bio_flags |= BIO_UNMAPPED;
	} else {
		bio->bio_data = bp->b_data;
		bio->bio_ma = NULL;
	}
	g_io_request(bio, cp);
	return;
}

/*
 * Orphan method of the swap GEOM class.  Marks the swap device that uses
 * this consumer, if any, as SW_CLOSING and drops the consumer's initial
 * reference.  The consumer is torn down here only when a matching device
 * was found and no other references remain.
 */
static void
swapgeom_orphan(struct g_consumer *cp)
{
	struct swdevt *sp;
	int destroy;

	mtx_lock(&sw_dev_mtx);
	/* sp is NULL after the loop if no device uses this consumer. */
	TAILQ_FOREACH(sp, &swtailq, sw_list) {
		if (sp->sw_id == cp) {
			sp->sw_flags |= SW_CLOSING;
			break;
		}
	}
	/*
	 * Drop reference we were created with. Do directly since we're in a
	 * special context where we don't have to queue the call to
	 * swapgeom_close_ev().
	 */
	cp->index--;
	destroy = ((sp != NULL) && (cp->index == 0));
	if (destroy)
		sp->sw_id = NULL;
	mtx_unlock(&sw_dev_mtx);
	if (destroy)
		swapgeom_close_ev(cp, 0);
}

/*
 * Close method for GEOM-backed swap devices.  Detaches the consumer from
 * the swdevt under sw_dev_mtx and, if there was one, waits for the event
 * thread to run swapgeom_close_ev() on it.  td is unused.
 */
static void
swapgeom_close(struct thread *td, struct swdevt *sw)
{
	struct g_consumer *cp;

	mtx_lock(&sw_dev_mtx);
	cp = sw->sw_id;
	sw->sw_id = NULL;
	mtx_unlock(&sw_dev_mtx);

	/*
	 * swapgeom_close() may be called from the biodone context,
	 * where we cannot perform topology changes.  Delegate the
	 * work to the events thread.
	 */
	if (cp != NULL)
		g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
}

/*
 * Attach a swap consumer to the provider behind dev and register it as a
 * swap device.  Called by swapongeom() between g_topology_lock() and
 * g_topology_unlock().
 *
 * Returns ENODEV if dev has no provider, EBUSY if the provider is already
 * used for swap, the g_access() error if access cannot be obtained, and 0
 * on success.
 */
static int
swapongeom_locked(struct cdev *dev, struct vnode *vp)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	static struct g_geom *gp;
	struct swdevt *sp;
	u_long nblks;
	int error;

	pp = g_dev_getprovider(dev);
	if (pp == NULL)
		return (ENODEV);
	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH(sp, &swtailq, sw_list) {
		cp = sp->sw_id;
		if (cp != NULL && cp->provider == pp) {
			mtx_unlock(&sw_dev_mtx);
			return (EBUSY);
		}
	}
	mtx_unlock(&sw_dev_mtx);
	/* The single "swap" geom is created on first use and then reused. */
	if (gp == NULL)
		gp = g_new_geomf(&g_swap_class, "swap");
	cp = g_new_consumer(gp);
	cp->index = 1;	/* Number of active I/Os, plus one for being active. */
	cp->flags |=  G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	g_attach(cp, pp);
	/*
	 * XXX: Every time you think you can improve the margin for
	 * footshooting, somebody depends on the ability to do so:
	 * savecore(8) wants to write to our swapdev so we cannot
	 * set an exclusive count :-(
	 */
	error = g_access(cp, 1, 1, 0);
	if (error != 0) {
		g_detach(cp);
		g_destroy_consumer(cp);
		return (error);
	}
	/* swaponsomething() expects the size in DEV_BSIZE units. */
	nblks = pp->mediasize / DEV_BSIZE;
	swaponsomething(vp, cp, nblks, swapgeom_strategy,
	    swapgeom_close, dev2udev(dev),
	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
	return (0);
}

/*
 * Enable swapping on the disk device behind vp, which must be
 * exclusively locked.  Returns ENOENT if vp is not a character device
 * vnode or has been doomed; otherwise the result of swapongeom_locked().
 */
static int
swapongeom(struct vnode *vp)
{
	int error;

	ASSERT_VOP_ELOCKED(vp, "swapongeom");
	if (vp->v_type != VCHR || VN_IS_DOOMED(vp)) {
		error = ENOENT;
	} else {
		g_topology_lock();
		error = swapongeom_locked(vp->v_rdev, vp);
		g_topology_unlock();
	}
	return (error);
}

/*
 * VNODE backend
 *
 * This is used mainly for network filesystem (read: probably only tested
 * with NFS) swapfiles.
3005 * 3006 */ 3007 3008 static void 3009 swapdev_strategy(struct buf *bp, struct swdevt *sp) 3010 { 3011 struct vnode *vp2; 3012 3013 bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); 3014 3015 vp2 = sp->sw_id; 3016 vhold(vp2); 3017 if (bp->b_iocmd == BIO_WRITE) { 3018 vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); 3019 if (bp->b_bufobj) 3020 bufobj_wdrop(bp->b_bufobj); 3021 bufobj_wref(&vp2->v_bufobj); 3022 } else { 3023 vn_lock(vp2, LK_SHARED | LK_RETRY); 3024 } 3025 if (bp->b_bufobj != &vp2->v_bufobj) 3026 bp->b_bufobj = &vp2->v_bufobj; 3027 bp->b_vp = vp2; 3028 bp->b_iooffset = dbtob(bp->b_blkno); 3029 bstrategy(bp); 3030 VOP_UNLOCK(vp2); 3031 } 3032 3033 static void 3034 swapdev_close(struct thread *td, struct swdevt *sp) 3035 { 3036 struct vnode *vp; 3037 3038 vp = sp->sw_vp; 3039 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3040 VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td); 3041 vput(vp); 3042 } 3043 3044 static int 3045 swaponvp(struct thread *td, struct vnode *vp, u_long nblks) 3046 { 3047 struct swdevt *sp; 3048 int error; 3049 3050 ASSERT_VOP_ELOCKED(vp, "swaponvp"); 3051 if (nblks == 0) 3052 return (ENXIO); 3053 mtx_lock(&sw_dev_mtx); 3054 TAILQ_FOREACH(sp, &swtailq, sw_list) { 3055 if (sp->sw_id == vp) { 3056 mtx_unlock(&sw_dev_mtx); 3057 return (EBUSY); 3058 } 3059 } 3060 mtx_unlock(&sw_dev_mtx); 3061 3062 #ifdef MAC 3063 error = mac_system_check_swapon(td->td_ucred, vp); 3064 if (error == 0) 3065 #endif 3066 error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); 3067 if (error != 0) 3068 return (error); 3069 3070 swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, 3071 NODEV, 0); 3072 return (0); 3073 } 3074 3075 static int 3076 sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) 3077 { 3078 int error, new, n; 3079 3080 new = nsw_wcount_async_max; 3081 error = sysctl_handle_int(oidp, &new, 0, req); 3082 if (error != 0 || req->newptr == NULL) 3083 return (error); 3084 3085 if (new > nswbuf / 2 || new < 1) 3086 return (EINVAL); 3087 3088 
mtx_lock(&swbuf_mtx); 3089 while (nsw_wcount_async_max != new) { 3090 /* 3091 * Adjust difference. If the current async count is too low, 3092 * we will need to sqeeze our update slowly in. Sleep with a 3093 * higher priority than getpbuf() to finish faster. 3094 */ 3095 n = new - nsw_wcount_async_max; 3096 if (nsw_wcount_async + n >= 0) { 3097 nsw_wcount_async += n; 3098 nsw_wcount_async_max += n; 3099 wakeup(&nsw_wcount_async); 3100 } else { 3101 nsw_wcount_async_max -= nsw_wcount_async; 3102 nsw_wcount_async = 0; 3103 msleep(&nsw_wcount_async, &swbuf_mtx, PSWP, 3104 "swpsysctl", 0); 3105 } 3106 } 3107 mtx_unlock(&swbuf_mtx); 3108 3109 return (0); 3110 } 3111 3112 static void 3113 swap_pager_update_writecount(vm_object_t object, vm_offset_t start, 3114 vm_offset_t end) 3115 { 3116 3117 VM_OBJECT_WLOCK(object); 3118 KASSERT((object->flags & OBJ_ANON) == 0, 3119 ("Splittable object with writecount")); 3120 object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; 3121 VM_OBJECT_WUNLOCK(object); 3122 } 3123 3124 static void 3125 swap_pager_release_writecount(vm_object_t object, vm_offset_t start, 3126 vm_offset_t end) 3127 { 3128 3129 VM_OBJECT_WLOCK(object); 3130 KASSERT((object->flags & OBJ_ANON) == 0, 3131 ("Splittable object with writecount")); 3132 object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; 3133 VM_OBJECT_WUNLOCK(object); 3134 } 3135