/*-
 * Copyright (c) 1998 Matthew Dillon,
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *				New Swap System
 *				Matthew Dillon
 *
 * Radix Bitmap 'blists'.
 *
 *	- The new swapper uses the new radix bitmap code.  This should scale
 *	  to arbitrarily small or arbitrarily large swap spaces and an almost
 *	  arbitrary degree of fragmentation.
 *
 * Features:
 *
 *	- on the fly reallocation of swap during putpages.  The new system
 *	  does not try to keep previously allocated swap blocks for dirty
 *	  pages.
 *
 *	- on the fly deallocation of swap
 *
 *	- No more garbage collection required.  Unnecessarily allocated swap
 *	  blocks only exist for dirty vm_page_t's now and these are already
 *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
 *	  removal of invalidated swap blocks when a page is destroyed
 *	  or renamed.
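 *
 *	(Summary note, derived from the structures declared below: swap
 *	assignments are recorded in fixed-size groups of SWAP_META_PAGES
 *	page slots, and each group in use is described by a struct swblock
 *	hashed on the owning object and page index.)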
62 * 63 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ 64 * 65 * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 66 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 67 */ 68 69 #include <sys/cdefs.h> 70 __FBSDID("$FreeBSD$"); 71 72 #include "opt_swap.h" 73 #include "opt_vm.h" 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/conf.h> 78 #include <sys/kernel.h> 79 #include <sys/priv.h> 80 #include <sys/proc.h> 81 #include <sys/bio.h> 82 #include <sys/buf.h> 83 #include <sys/disk.h> 84 #include <sys/fcntl.h> 85 #include <sys/mount.h> 86 #include <sys/namei.h> 87 #include <sys/vnode.h> 88 #include <sys/malloc.h> 89 #include <sys/racct.h> 90 #include <sys/resource.h> 91 #include <sys/resourcevar.h> 92 #include <sys/rwlock.h> 93 #include <sys/sysctl.h> 94 #include <sys/sysproto.h> 95 #include <sys/blist.h> 96 #include <sys/lock.h> 97 #include <sys/sx.h> 98 #include <sys/vmmeter.h> 99 100 #include <security/mac/mac_framework.h> 101 102 #include <vm/vm.h> 103 #include <vm/pmap.h> 104 #include <vm/vm_map.h> 105 #include <vm/vm_kern.h> 106 #include <vm/vm_object.h> 107 #include <vm/vm_page.h> 108 #include <vm/vm_pager.h> 109 #include <vm/vm_pageout.h> 110 #include <vm/vm_param.h> 111 #include <vm/swap_pager.h> 112 #include <vm/vm_extern.h> 113 #include <vm/uma.h> 114 115 #include <geom/geom.h> 116 117 /* 118 * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, 16 119 * or 32 pages per allocation. 120 * The 32-page limit is due to the radix code (kern/subr_blist.c). 121 */ 122 #ifndef MAX_PAGEOUT_CLUSTER 123 #define MAX_PAGEOUT_CLUSTER 16 124 #endif 125 126 #if !defined(SWB_NPAGES) 127 #define SWB_NPAGES MAX_PAGEOUT_CLUSTER 128 #endif 129 130 /* 131 * The swblock structure maps an object and a small, fixed-size range 132 * of page indices to disk addresses within a swap area. 133 * The collection of these mappings is implemented as a hash table. 134 * Unused disk addresses within a swap area are allocated and managed 135 * using a blist. 136 */ 137 #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) 138 #define SWAP_META_PAGES (SWB_NPAGES * 2) 139 #define SWAP_META_MASK (SWAP_META_PAGES - 1) 140 141 struct swblock { 142 struct swblock *swb_hnext; 143 vm_object_t swb_object; 144 vm_pindex_t swb_index; 145 int swb_count; 146 daddr_t swb_pages[SWAP_META_PAGES]; 147 }; 148 149 static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); 150 static struct mtx sw_dev_mtx; 151 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); 152 static struct swdevt *swdevhd; /* Allocate from here next */ 153 static int nswapdev; /* Number of swap devices */ 154 int swap_pager_avail; 155 static int swdev_syscall_active = 0; /* serialize swap(on|off) */ 156 157 static vm_ooffset_t swap_total; 158 SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, 159 "Total amount of available swap storage."); 160 static vm_ooffset_t swap_reserved; 161 SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, 162 "Amount of swap storage needed to back all allocated anonymous memory."); 163 static int overcommit = 0; 164 SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0, 165 "Configure virtual memory overcommit behavior. 
See tuning(7) " 166 "for details."); 167 static unsigned long swzone; 168 SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, 169 "Actual size of swap metadata zone"); 170 static unsigned long swap_maxpages; 171 SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, 172 "Maximum amount of swap supported"); 173 174 /* bits from overcommit */ 175 #define SWAP_RESERVE_FORCE_ON (1 << 0) 176 #define SWAP_RESERVE_RLIMIT_ON (1 << 1) 177 #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) 178 179 int 180 swap_reserve(vm_ooffset_t incr) 181 { 182 183 return (swap_reserve_by_cred(incr, curthread->td_ucred)); 184 } 185 186 int 187 swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) 188 { 189 vm_ooffset_t r, s; 190 int res, error; 191 static int curfail; 192 static struct timeval lastfail; 193 struct uidinfo *uip; 194 195 uip = cred->cr_ruidinfo; 196 197 if (incr & PAGE_MASK) 198 panic("swap_reserve: & PAGE_MASK"); 199 200 #ifdef RACCT 201 if (racct_enable) { 202 PROC_LOCK(curproc); 203 error = racct_add(curproc, RACCT_SWAP, incr); 204 PROC_UNLOCK(curproc); 205 if (error != 0) 206 return (0); 207 } 208 #endif 209 210 res = 0; 211 mtx_lock(&sw_dev_mtx); 212 r = swap_reserved + incr; 213 if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { 214 s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_cnt.v_wire_count; 215 s *= PAGE_SIZE; 216 } else 217 s = 0; 218 s += swap_total; 219 if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || 220 (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { 221 res = 1; 222 swap_reserved = r; 223 } 224 mtx_unlock(&sw_dev_mtx); 225 226 if (res) { 227 UIDINFO_VMSIZE_LOCK(uip); 228 if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && 229 uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) && 230 priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) 231 res = 0; 232 else 233 uip->ui_vmsize += incr; 234 UIDINFO_VMSIZE_UNLOCK(uip); 235 if (!res) { 236 mtx_lock(&sw_dev_mtx); 237 swap_reserved -= incr; 238 mtx_unlock(&sw_dev_mtx); 239 } 240 } 241 if (!res && ppsratecheck(&lastfail, &curfail, 1)) { 242 printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", 243 uip->ui_uid, curproc->p_pid, incr); 244 } 245 246 #ifdef RACCT 247 if (!res) { 248 PROC_LOCK(curproc); 249 racct_sub(curproc, RACCT_SWAP, incr); 250 PROC_UNLOCK(curproc); 251 } 252 #endif 253 254 return (res); 255 } 256 257 void 258 swap_reserve_force(vm_ooffset_t incr) 259 { 260 struct uidinfo *uip; 261 262 mtx_lock(&sw_dev_mtx); 263 swap_reserved += incr; 264 mtx_unlock(&sw_dev_mtx); 265 266 #ifdef RACCT 267 PROC_LOCK(curproc); 268 racct_add_force(curproc, RACCT_SWAP, incr); 269 PROC_UNLOCK(curproc); 270 #endif 271 272 uip = curthread->td_ucred->cr_ruidinfo; 273 PROC_LOCK(curproc); 274 UIDINFO_VMSIZE_LOCK(uip); 275 uip->ui_vmsize += incr; 276 UIDINFO_VMSIZE_UNLOCK(uip); 277 PROC_UNLOCK(curproc); 278 } 279 280 void 281 swap_release(vm_ooffset_t decr) 282 { 283 struct ucred *cred; 284 285 PROC_LOCK(curproc); 286 cred = curthread->td_ucred; 287 swap_release_by_cred(decr, cred); 288 PROC_UNLOCK(curproc); 289 } 290 291 void 292 swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) 293 { 294 struct uidinfo *uip; 295 296 uip = cred->cr_ruidinfo; 297 298 if (decr & PAGE_MASK) 299 panic("swap_release: & PAGE_MASK"); 300 301 mtx_lock(&sw_dev_mtx); 302 if (swap_reserved < decr) 303 panic("swap_reserved < decr"); 304 swap_reserved -= decr; 305 mtx_unlock(&sw_dev_mtx); 306 307 UIDINFO_VMSIZE_LOCK(uip); 308 if (uip->ui_vmsize < decr) 309 printf("negative vmsize for uid = %d\n", 
uip->ui_uid); 310 uip->ui_vmsize -= decr; 311 UIDINFO_VMSIZE_UNLOCK(uip); 312 313 racct_sub_cred(cred, RACCT_SWAP, decr); 314 } 315 316 static void swapdev_strategy(struct buf *, struct swdevt *sw); 317 318 #define SWM_FREE 0x02 /* free, period */ 319 #define SWM_POP 0x04 /* pop out */ 320 321 int swap_pager_full = 2; /* swap space exhaustion (task killing) */ 322 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ 323 static int nsw_rcount; /* free read buffers */ 324 static int nsw_wcount_sync; /* limit write buffers / synchronous */ 325 static int nsw_wcount_async; /* limit write buffers / asynchronous */ 326 static int nsw_wcount_async_max;/* assigned maximum */ 327 static int nsw_cluster_max; /* maximum VOP I/O allowed */ 328 329 static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); 330 SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW, 331 NULL, 0, sysctl_swap_async_max, "I", "Maximum running async swap ops"); 332 333 static struct swblock **swhash; 334 static int swhash_mask; 335 static struct mtx swhash_mtx; 336 337 static struct sx sw_alloc_sx; 338 339 /* 340 * "named" and "unnamed" anon region objects. Try to reduce the overhead 341 * of searching a named list by hashing it just a little. 342 */ 343 344 #define NOBJLISTS 8 345 346 #define NOBJLIST(handle) \ 347 (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) 348 349 static struct mtx sw_alloc_mtx; /* protect list manipulation */ 350 static struct pagerlst swap_pager_object_list[NOBJLISTS]; 351 static uma_zone_t swap_zone; 352 353 /* 354 * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure 355 * calls hooked from other parts of the VM system and do not appear here. 356 * (see vm/swap_pager.h). 357 */ 358 static vm_object_t 359 swap_pager_alloc(void *handle, vm_ooffset_t size, 360 vm_prot_t prot, vm_ooffset_t offset, struct ucred *); 361 static void swap_pager_dealloc(vm_object_t object); 362 static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int); 363 static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int, 364 pgo_getpages_iodone_t, void *); 365 static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); 366 static boolean_t 367 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); 368 static void swap_pager_init(void); 369 static void swap_pager_unswapped(vm_page_t); 370 static void swap_pager_swapoff(struct swdevt *sp); 371 372 struct pagerops swappagerops = { 373 .pgo_init = swap_pager_init, /* early system initialization of pager */ 374 .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ 375 .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ 376 .pgo_getpages = swap_pager_getpages, /* pagein */ 377 .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ 378 .pgo_putpages = swap_pager_putpages, /* pageout */ 379 .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ 380 .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ 381 }; 382 383 /* 384 * dmmax is in page-sized chunks with the new swap system. It was 385 * dev-bsized chunks in the old. dmmax is always a power of 2. 386 * 387 * swap_*() routines are externally accessible. swp_*() routines are 388 * internal. 
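 *
 * (For example, with SWB_NPAGES at its default of MAX_PAGEOUT_CLUSTER,
 * i.e. 16, swap_pager_init() below sets dmmax to 32 page-sized blocks.)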
 */
static int dmmax;
static int nswap_lowat = 128;	/* in pages, swap_pager_almost_full warn */
static int nswap_hiwat = 512;	/* in pages, swap_pager_almost_full warn */

SYSCTL_INT(_vm, OID_AUTO, dmmax,
	CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");

static void	swp_sizecheck(void);
static void	swp_pager_async_iodone(struct buf *bp);
static int	swapongeom(struct thread *, struct vnode *);
static int	swaponvp(struct thread *, struct vnode *, u_long);
static int	swapoff_one(struct swdevt *sp, struct ucred *cred);

/*
 * Swap bitmap functions
 */
static void	swp_pager_freeswapspace(daddr_t blk, int npages);
static daddr_t	swp_pager_getswapspace(int npages);

/*
 * Metadata functions
 */
static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free_all(vm_object_t);
static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);

static void
swp_pager_free_nrpage(vm_page_t m)
{

	vm_page_lock(m);
	if (m->wire_count == 0)
		vm_page_free(m);
	vm_page_unlock(m);
}

/*
 * SWP_SIZECHECK() -	update swap_pager_full indication
 *
 *	update the swap_pager_almost_full indication and warn when we are
 *	about to run out of swap space, using lowat/hiwat hysteresis.
 *
 *	Clear swap_pager_full ( task killing ) indication when lowat is met.
 *
 *	No restrictions on call
 *	This routine may not block.
 */
static void
swp_sizecheck(void)
{

	if (swap_pager_avail < nswap_lowat) {
		if (swap_pager_almost_full == 0) {
			printf("swap_pager: out of swap space\n");
			swap_pager_almost_full = 1;
		}
	} else {
		swap_pager_full = 0;
		if (swap_pager_avail > nswap_hiwat)
			swap_pager_almost_full = 0;
	}
}

/*
 * SWP_PAGER_HASH() -	hash swap meta data
 *
 *	This is a helper function which hashes the swapblk given
 *	the object and page index.  It returns a pointer to the pointer
 *	to the matching swblock, or a pointer to a NULL pointer if it
 *	could not find a swapblk.
 */
static struct swblock **
swp_pager_hash(vm_object_t object, vm_pindex_t index)
{
	struct swblock **pswap;
	struct swblock *swap;

	index &= ~(vm_pindex_t)SWAP_META_MASK;
	pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
	while ((swap = *pswap) != NULL) {
		if (swap->swb_object == object &&
		    swap->swb_index == index
		) {
			break;
		}
		pswap = &swap->swb_hnext;
	}
	return (pswap);
}

/*
 * SWAP_PAGER_INIT() -	initialize the swap pager!
 *
 *	Expected to be started from system init.  NOTE:  This code is run
 *	before much else so be careful what you depend on.  Most of the VM
 *	system has yet to be initialized at this point.
488 */ 489 static void 490 swap_pager_init(void) 491 { 492 /* 493 * Initialize object lists 494 */ 495 int i; 496 497 for (i = 0; i < NOBJLISTS; ++i) 498 TAILQ_INIT(&swap_pager_object_list[i]); 499 mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF); 500 mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); 501 502 /* 503 * Device Stripe, in PAGE_SIZE'd blocks 504 */ 505 dmmax = SWB_NPAGES * 2; 506 } 507 508 /* 509 * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process 510 * 511 * Expected to be started from pageout process once, prior to entering 512 * its main loop. 513 */ 514 void 515 swap_pager_swap_init(void) 516 { 517 unsigned long n, n2; 518 519 /* 520 * Number of in-transit swap bp operations. Don't 521 * exhaust the pbufs completely. Make sure we 522 * initialize workable values (0 will work for hysteresis 523 * but it isn't very efficient). 524 * 525 * The nsw_cluster_max is constrained by the bp->b_pages[] 526 * array (MAXPHYS/PAGE_SIZE) and our locally defined 527 * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are 528 * constrained by the swap device interleave stripe size. 529 * 530 * Currently we hardwire nsw_wcount_async to 4. This limit is 531 * designed to prevent other I/O from having high latencies due to 532 * our pageout I/O. The value 4 works well for one or two active swap 533 * devices but is probably a little low if you have more. Even so, 534 * a higher value would probably generate only a limited improvement 535 * with three or four active swap devices since the system does not 536 * typically have to pageout at extreme bandwidths. We will want 537 * at least 2 per swap devices, and 4 is a pretty good value if you 538 * have one NFS swap device due to the command/ack latency over NFS. 539 * So it all works out pretty well. 540 */ 541 nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); 542 543 mtx_lock(&pbuf_mtx); 544 nsw_rcount = (nswbuf + 1) / 2; 545 nsw_wcount_sync = (nswbuf + 3) / 4; 546 nsw_wcount_async = 4; 547 nsw_wcount_async_max = nsw_wcount_async; 548 mtx_unlock(&pbuf_mtx); 549 550 /* 551 * Initialize our zone. Right now I'm just guessing on the number 552 * we need based on the number of pages in the system. Each swblock 553 * can hold 32 pages, so this is probably overkill. This reservation 554 * is typically limited to around 32MB by default. 555 */ 556 n = vm_cnt.v_page_count / 2; 557 if (maxswzone && n > maxswzone / sizeof(struct swblock)) 558 n = maxswzone / sizeof(struct swblock); 559 n2 = n; 560 swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, 561 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); 562 if (swap_zone == NULL) 563 panic("failed to create swap_zone."); 564 do { 565 if (uma_zone_reserve_kva(swap_zone, n)) 566 break; 567 /* 568 * if the allocation failed, try a zone two thirds the 569 * size of the previous attempt. 570 */ 571 n -= ((n + 2) / 3); 572 } while (n > 0); 573 if (n2 != n) 574 printf("Swap zone entries reduced from %lu to %lu.\n", n2, n); 575 swap_maxpages = n * SWAP_META_PAGES; 576 swzone = n * sizeof(struct swblock); 577 n2 = n; 578 579 /* 580 * Initialize our meta-data hash table. The swapper does not need to 581 * be quite as efficient as the VM system, so we do not use an 582 * oversized hash table. 
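	 *
	 * (Illustrative sizing note: the loop below picks the smallest
	 * power of two that is at least one eighth of the final zone size,
	 * so e.g. a zone sized for 65536 swblock entries yields an
	 * 8192-bucket table.)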
583 * 584 * n: size of hash table, must be power of 2 585 * swhash_mask: hash table index mask 586 */ 587 for (n = 1; n < n2 / 8; n *= 2) 588 ; 589 swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); 590 swhash_mask = n - 1; 591 mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); 592 } 593 594 /* 595 * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate 596 * its metadata structures. 597 * 598 * This routine is called from the mmap and fork code to create a new 599 * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object 600 * and then converting it with swp_pager_meta_build(). 601 * 602 * This routine may block in vm_object_allocate() and create a named 603 * object lookup race, so we must interlock. 604 * 605 * MPSAFE 606 */ 607 static vm_object_t 608 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 609 vm_ooffset_t offset, struct ucred *cred) 610 { 611 vm_object_t object; 612 vm_pindex_t pindex; 613 614 pindex = OFF_TO_IDX(offset + PAGE_MASK + size); 615 if (handle) { 616 mtx_lock(&Giant); 617 /* 618 * Reference existing named region or allocate new one. There 619 * should not be a race here against swp_pager_meta_build() 620 * as called from vm_page_remove() in regards to the lookup 621 * of the handle. 622 */ 623 sx_xlock(&sw_alloc_sx); 624 object = vm_pager_object_lookup(NOBJLIST(handle), handle); 625 if (object == NULL) { 626 if (cred != NULL) { 627 if (!swap_reserve_by_cred(size, cred)) { 628 sx_xunlock(&sw_alloc_sx); 629 mtx_unlock(&Giant); 630 return (NULL); 631 } 632 crhold(cred); 633 } 634 object = vm_object_allocate(OBJT_DEFAULT, pindex); 635 VM_OBJECT_WLOCK(object); 636 object->handle = handle; 637 if (cred != NULL) { 638 object->cred = cred; 639 object->charge = size; 640 } 641 swp_pager_meta_build(object, 0, SWAPBLK_NONE); 642 VM_OBJECT_WUNLOCK(object); 643 } 644 sx_xunlock(&sw_alloc_sx); 645 mtx_unlock(&Giant); 646 } else { 647 if (cred != NULL) { 648 if (!swap_reserve_by_cred(size, cred)) 649 return (NULL); 650 crhold(cred); 651 } 652 object = vm_object_allocate(OBJT_DEFAULT, pindex); 653 VM_OBJECT_WLOCK(object); 654 if (cred != NULL) { 655 object->cred = cred; 656 object->charge = size; 657 } 658 swp_pager_meta_build(object, 0, SWAPBLK_NONE); 659 VM_OBJECT_WUNLOCK(object); 660 } 661 return (object); 662 } 663 664 /* 665 * SWAP_PAGER_DEALLOC() - remove swap metadata from object 666 * 667 * The swap backing for the object is destroyed. The code is 668 * designed such that we can reinstantiate it later, but this 669 * routine is typically called only when the entire object is 670 * about to be destroyed. 671 * 672 * The object must be locked. 673 */ 674 static void 675 swap_pager_dealloc(vm_object_t object) 676 { 677 678 /* 679 * Remove from list right away so lookups will fail if we block for 680 * pageout completion. 681 */ 682 if (object->handle != NULL) { 683 mtx_lock(&sw_alloc_mtx); 684 TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); 685 mtx_unlock(&sw_alloc_mtx); 686 } 687 688 VM_OBJECT_ASSERT_WLOCKED(object); 689 vm_object_pip_wait(object, "swpdea"); 690 691 /* 692 * Free all remaining metadata. We only bother to free it from 693 * the swap meta data. We do not attempt to free swapblk's still 694 * associated with vm_page_t's for this object. We do not care 695 * if paging is still in progress on some objects. 
696 */ 697 swp_pager_meta_free_all(object); 698 object->handle = NULL; 699 object->type = OBJT_DEAD; 700 } 701 702 /************************************************************************ 703 * SWAP PAGER BITMAP ROUTINES * 704 ************************************************************************/ 705 706 /* 707 * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space 708 * 709 * Allocate swap for the requested number of pages. The starting 710 * swap block number (a page index) is returned or SWAPBLK_NONE 711 * if the allocation failed. 712 * 713 * Also has the side effect of advising that somebody made a mistake 714 * when they configured swap and didn't configure enough. 715 * 716 * This routine may not sleep. 717 * 718 * We allocate in round-robin fashion from the configured devices. 719 */ 720 static daddr_t 721 swp_pager_getswapspace(int npages) 722 { 723 daddr_t blk; 724 struct swdevt *sp; 725 int i; 726 727 blk = SWAPBLK_NONE; 728 mtx_lock(&sw_dev_mtx); 729 sp = swdevhd; 730 for (i = 0; i < nswapdev; i++) { 731 if (sp == NULL) 732 sp = TAILQ_FIRST(&swtailq); 733 if (!(sp->sw_flags & SW_CLOSING)) { 734 blk = blist_alloc(sp->sw_blist, npages); 735 if (blk != SWAPBLK_NONE) { 736 blk += sp->sw_first; 737 sp->sw_used += npages; 738 swap_pager_avail -= npages; 739 swp_sizecheck(); 740 swdevhd = TAILQ_NEXT(sp, sw_list); 741 goto done; 742 } 743 } 744 sp = TAILQ_NEXT(sp, sw_list); 745 } 746 if (swap_pager_full != 2) { 747 printf("swap_pager_getswapspace(%d): failed\n", npages); 748 swap_pager_full = 2; 749 swap_pager_almost_full = 1; 750 } 751 swdevhd = NULL; 752 done: 753 mtx_unlock(&sw_dev_mtx); 754 return (blk); 755 } 756 757 static int 758 swp_pager_isondev(daddr_t blk, struct swdevt *sp) 759 { 760 761 return (blk >= sp->sw_first && blk < sp->sw_end); 762 } 763 764 static void 765 swp_pager_strategy(struct buf *bp) 766 { 767 struct swdevt *sp; 768 769 mtx_lock(&sw_dev_mtx); 770 TAILQ_FOREACH(sp, &swtailq, sw_list) { 771 if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { 772 mtx_unlock(&sw_dev_mtx); 773 if ((sp->sw_flags & SW_UNMAPPED) != 0 && 774 unmapped_buf_allowed) { 775 bp->b_kvaalloc = bp->b_data; 776 bp->b_data = unmapped_buf; 777 bp->b_kvabase = unmapped_buf; 778 bp->b_offset = 0; 779 bp->b_flags |= B_UNMAPPED; 780 } else { 781 pmap_qenter((vm_offset_t)bp->b_data, 782 &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); 783 } 784 sp->sw_strategy(bp, sp); 785 return; 786 } 787 } 788 panic("Swapdev not found"); 789 } 790 791 792 /* 793 * SWP_PAGER_FREESWAPSPACE() - free raw swap space 794 * 795 * This routine returns the specified swap blocks back to the bitmap. 796 * 797 * This routine may not sleep. 798 */ 799 static void 800 swp_pager_freeswapspace(daddr_t blk, int npages) 801 { 802 struct swdevt *sp; 803 804 mtx_lock(&sw_dev_mtx); 805 TAILQ_FOREACH(sp, &swtailq, sw_list) { 806 if (blk >= sp->sw_first && blk < sp->sw_end) { 807 sp->sw_used -= npages; 808 /* 809 * If we are attempting to stop swapping on 810 * this device, we don't want to mark any 811 * blocks free lest they be reused. 812 */ 813 if ((sp->sw_flags & SW_CLOSING) == 0) { 814 blist_free(sp->sw_blist, blk - sp->sw_first, 815 npages); 816 swap_pager_avail += npages; 817 swp_sizecheck(); 818 } 819 mtx_unlock(&sw_dev_mtx); 820 return; 821 } 822 } 823 panic("Swapdev not found"); 824 } 825 826 /* 827 * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page 828 * range within an object. 829 * 830 * This is a globally accessible routine. 831 * 832 * This routine removes swapblk assignments from swap metadata. 
833 * 834 * The external callers of this routine typically have already destroyed 835 * or renamed vm_page_t's associated with this range in the object so 836 * we should be ok. 837 * 838 * The object must be locked. 839 */ 840 void 841 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) 842 { 843 844 swp_pager_meta_free(object, start, size); 845 } 846 847 /* 848 * SWAP_PAGER_RESERVE() - reserve swap blocks in object 849 * 850 * Assigns swap blocks to the specified range within the object. The 851 * swap blocks are not zeroed. Any previous swap assignment is destroyed. 852 * 853 * Returns 0 on success, -1 on failure. 854 */ 855 int 856 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) 857 { 858 int n = 0; 859 daddr_t blk = SWAPBLK_NONE; 860 vm_pindex_t beg = start; /* save start index */ 861 862 VM_OBJECT_WLOCK(object); 863 while (size) { 864 if (n == 0) { 865 n = BLIST_MAX_ALLOC; 866 while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) { 867 n >>= 1; 868 if (n == 0) { 869 swp_pager_meta_free(object, beg, start - beg); 870 VM_OBJECT_WUNLOCK(object); 871 return (-1); 872 } 873 } 874 } 875 swp_pager_meta_build(object, start, blk); 876 --size; 877 ++start; 878 ++blk; 879 --n; 880 } 881 swp_pager_meta_free(object, start, n); 882 VM_OBJECT_WUNLOCK(object); 883 return (0); 884 } 885 886 /* 887 * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager 888 * and destroy the source. 889 * 890 * Copy any valid swapblks from the source to the destination. In 891 * cases where both the source and destination have a valid swapblk, 892 * we keep the destination's. 893 * 894 * This routine is allowed to sleep. It may sleep allocating metadata 895 * indirectly through swp_pager_meta_build() or if paging is still in 896 * progress on the source. 897 * 898 * The source object contains no vm_page_t's (which is just as well) 899 * 900 * The source object is of type OBJT_SWAP. 901 * 902 * The source and destination objects must be locked. 903 * Both object locks may temporarily be released. 904 */ 905 void 906 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, 907 vm_pindex_t offset, int destroysource) 908 { 909 vm_pindex_t i; 910 911 VM_OBJECT_ASSERT_WLOCKED(srcobject); 912 VM_OBJECT_ASSERT_WLOCKED(dstobject); 913 914 /* 915 * If destroysource is set, we remove the source object from the 916 * swap_pager internal queue now. 917 */ 918 if (destroysource) { 919 if (srcobject->handle != NULL) { 920 mtx_lock(&sw_alloc_mtx); 921 TAILQ_REMOVE( 922 NOBJLIST(srcobject->handle), 923 srcobject, 924 pager_object_list 925 ); 926 mtx_unlock(&sw_alloc_mtx); 927 } 928 } 929 930 /* 931 * transfer source to destination. 932 */ 933 for (i = 0; i < dstobject->size; ++i) { 934 daddr_t dstaddr; 935 936 /* 937 * Locate (without changing) the swapblk on the destination, 938 * unless it is invalid in which case free it silently, or 939 * if the destination is a resident page, in which case the 940 * source is thrown away. 941 */ 942 dstaddr = swp_pager_meta_ctl(dstobject, i, 0); 943 944 if (dstaddr == SWAPBLK_NONE) { 945 /* 946 * Destination has no swapblk and is not resident, 947 * copy source. 948 */ 949 daddr_t srcaddr; 950 951 srcaddr = swp_pager_meta_ctl( 952 srcobject, 953 i + offset, 954 SWM_POP 955 ); 956 957 if (srcaddr != SWAPBLK_NONE) { 958 /* 959 * swp_pager_meta_build() can sleep. 
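				 * (The source object lock is dropped around
				 * the build below; paging-in-progress
				 * references are taken on both objects
				 * across that window.)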
				 */
				vm_object_pip_add(srcobject, 1);
				VM_OBJECT_WUNLOCK(srcobject);
				vm_object_pip_add(dstobject, 1);
				swp_pager_meta_build(dstobject, i, srcaddr);
				vm_object_pip_wakeup(dstobject);
				VM_OBJECT_WLOCK(srcobject);
				vm_object_pip_wakeup(srcobject);
			}
		} else {
			/*
			 * Destination has valid swapblk or it is represented
			 * by a resident page.  We destroy the sourceblock.
			 */

			swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
		}
	}

	/*
	 * Free left over swap blocks in source.
	 *
	 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
	 * double-remove the object from the swap queues.
	 */
	if (destroysource) {
		swp_pager_meta_free_all(srcobject);
		/*
		 * Reverting the type is not necessary, the caller is going
		 * to destroy srcobject directly, but I'm doing it here
		 * for consistency since we've removed the object from its
		 * queues.
		 */
		srcobject->type = OBJT_DEFAULT;
	}
}

/*
 * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
 *				the requested page.
 *
 *	We determine whether good backing store exists for the requested
 *	page and return TRUE if it does, FALSE if it doesn't.
 *
 *	If TRUE, we also try to determine how much valid, contiguous backing
 *	store exists before and after the requested page within a reasonable
 *	distance.  We do not try to restrict it to the swap device stripe
 *	(that is handled in getpages/putpages).  It probably isn't worth
 *	doing here.
 */
static boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
{
	daddr_t blk0;

	VM_OBJECT_ASSERT_LOCKED(object);
	/*
	 * do we have good backing store at the requested index ?
	 */
	blk0 = swp_pager_meta_ctl(object, pindex, 0);

	if (blk0 == SWAPBLK_NONE) {
		if (before)
			*before = 0;
		if (after)
			*after = 0;
		return (FALSE);
	}

	/*
	 * find backwards-looking contiguous good backing store
	 */
	if (before != NULL) {
		int i;

		for (i = 1; i < (SWB_NPAGES/2); ++i) {
			daddr_t blk;

			if (i > pindex)
				break;
			blk = swp_pager_meta_ctl(object, pindex - i, 0);
			if (blk != blk0 - i)
				break;
		}
		*before = (i - 1);
	}

	/*
	 * find forward-looking contiguous good backing store
	 */
	if (after != NULL) {
		int i;

		for (i = 1; i < (SWB_NPAGES/2); ++i) {
			daddr_t blk;

			blk = swp_pager_meta_ctl(object, pindex + i, 0);
			if (blk != blk0 + i)
				break;
		}
		*after = (i - 1);
	}
	return (TRUE);
}

/*
 * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
 *
 *	This removes any associated swap backing store, whether valid or
 *	not, from the page.
 *
 *	This routine is typically called when a page is made dirty, at
 *	which point any associated swap can be freed.  MADV_FREE also
 *	calls us in a special-case situation
 *
 *	NOTE!!!  If the page is clean and the swap was valid, the caller
 *	should make the page dirty before calling this routine.  This routine
 *	does NOT change the m->dirty status of the page.  Also: MADV_FREE
 *	depends on it.
 *
 *	This routine may not sleep.
 *
 *	The object containing the page must be locked.
1083 */ 1084 static void 1085 swap_pager_unswapped(vm_page_t m) 1086 { 1087 1088 swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); 1089 } 1090 1091 /* 1092 * SWAP_PAGER_GETPAGES() - bring pages in from swap 1093 * 1094 * Attempt to retrieve (m, count) pages from backing store, but make 1095 * sure we retrieve at least m[reqpage]. We try to load in as large 1096 * a chunk surrounding m[reqpage] as is contiguous in swap and which 1097 * belongs to the same object. 1098 * 1099 * The code is designed for asynchronous operation and 1100 * immediate-notification of 'reqpage' but tends not to be 1101 * used that way. Please do not optimize-out this algorithmic 1102 * feature, I intend to improve on it in the future. 1103 * 1104 * The parent has a single vm_object_pip_add() reference prior to 1105 * calling us and we should return with the same. 1106 * 1107 * The parent has BUSY'd the pages. We should return with 'm' 1108 * left busy, but the others adjusted. 1109 */ 1110 static int 1111 swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) 1112 { 1113 struct buf *bp; 1114 vm_page_t mreq; 1115 int i; 1116 int j; 1117 daddr_t blk; 1118 1119 mreq = m[reqpage]; 1120 1121 KASSERT(mreq->object == object, 1122 ("swap_pager_getpages: object mismatch %p/%p", 1123 object, mreq->object)); 1124 1125 /* 1126 * Calculate range to retrieve. The pages have already been assigned 1127 * their swapblks. We require a *contiguous* range but we know it to 1128 * not span devices. If we do not supply it, bad things 1129 * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the 1130 * loops are set up such that the case(s) are handled implicitly. 1131 * 1132 * The swp_*() calls must be made with the object locked. 1133 */ 1134 blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); 1135 1136 for (i = reqpage - 1; i >= 0; --i) { 1137 daddr_t iblk; 1138 1139 iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); 1140 if (blk != iblk + (reqpage - i)) 1141 break; 1142 } 1143 ++i; 1144 1145 for (j = reqpage + 1; j < count; ++j) { 1146 daddr_t jblk; 1147 1148 jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); 1149 if (blk != jblk - (j - reqpage)) 1150 break; 1151 } 1152 1153 /* 1154 * free pages outside our collection range. Note: we never free 1155 * mreq, it must remain busy throughout. 1156 */ 1157 if (0 < i || j < count) { 1158 int k; 1159 1160 for (k = 0; k < i; ++k) 1161 swp_pager_free_nrpage(m[k]); 1162 for (k = j; k < count; ++k) 1163 swp_pager_free_nrpage(m[k]); 1164 } 1165 1166 /* 1167 * Return VM_PAGER_FAIL if we have nothing to do. Return mreq 1168 * still busy, but the others unbusied. 1169 */ 1170 if (blk == SWAPBLK_NONE) 1171 return (VM_PAGER_FAIL); 1172 1173 /* 1174 * Getpbuf() can sleep. 
1175 */ 1176 VM_OBJECT_WUNLOCK(object); 1177 /* 1178 * Get a swap buffer header to perform the IO 1179 */ 1180 bp = getpbuf(&nsw_rcount); 1181 bp->b_flags |= B_PAGING; 1182 1183 bp->b_iocmd = BIO_READ; 1184 bp->b_iodone = swp_pager_async_iodone; 1185 bp->b_rcred = crhold(thread0.td_ucred); 1186 bp->b_wcred = crhold(thread0.td_ucred); 1187 bp->b_blkno = blk - (reqpage - i); 1188 bp->b_bcount = PAGE_SIZE * (j - i); 1189 bp->b_bufsize = PAGE_SIZE * (j - i); 1190 bp->b_pager.pg_reqpage = reqpage - i; 1191 1192 VM_OBJECT_WLOCK(object); 1193 { 1194 int k; 1195 1196 for (k = i; k < j; ++k) { 1197 bp->b_pages[k - i] = m[k]; 1198 m[k]->oflags |= VPO_SWAPINPROG; 1199 } 1200 } 1201 bp->b_npages = j - i; 1202 1203 PCPU_INC(cnt.v_swapin); 1204 PCPU_ADD(cnt.v_swappgsin, bp->b_npages); 1205 1206 /* 1207 * We still hold the lock on mreq, and our automatic completion routine 1208 * does not remove it. 1209 */ 1210 vm_object_pip_add(object, bp->b_npages); 1211 VM_OBJECT_WUNLOCK(object); 1212 1213 /* 1214 * perform the I/O. NOTE!!! bp cannot be considered valid after 1215 * this point because we automatically release it on completion. 1216 * Instead, we look at the one page we are interested in which we 1217 * still hold a lock on even through the I/O completion. 1218 * 1219 * The other pages in our m[] array are also released on completion, 1220 * so we cannot assume they are valid anymore either. 1221 * 1222 * NOTE: b_blkno is destroyed by the call to swapdev_strategy 1223 */ 1224 BUF_KERNPROC(bp); 1225 swp_pager_strategy(bp); 1226 1227 /* 1228 * wait for the page we want to complete. VPO_SWAPINPROG is always 1229 * cleared on completion. If an I/O error occurs, SWAPBLK_NONE 1230 * is set in the meta-data. 1231 */ 1232 VM_OBJECT_WLOCK(object); 1233 while ((mreq->oflags & VPO_SWAPINPROG) != 0) { 1234 mreq->oflags |= VPO_SWAPSLEEP; 1235 PCPU_INC(cnt.v_intrans); 1236 if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP, 1237 "swread", hz * 20)) { 1238 printf( 1239 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", 1240 bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); 1241 } 1242 } 1243 1244 /* 1245 * mreq is left busied after completion, but all the other pages 1246 * are freed. If we had an unrecoverable read error the page will 1247 * not be valid. 1248 */ 1249 if (mreq->valid != VM_PAGE_BITS_ALL) { 1250 return (VM_PAGER_ERROR); 1251 } else { 1252 return (VM_PAGER_OK); 1253 } 1254 1255 /* 1256 * A final note: in a low swap situation, we cannot deallocate swap 1257 * and mark a page dirty here because the caller is likely to mark 1258 * the page clean when we return, causing the page to possibly revert 1259 * to all-zero's later. 1260 */ 1261 } 1262 1263 /* 1264 * swap_pager_getpages_async(): 1265 * 1266 * Right now this is emulation of asynchronous operation on top of 1267 * swap_pager_getpages(). 
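 *
 *	(Behavioral note, reflecting the code below: the supplied
 *	pgo_getpages_iodone_t callback is invoked with (arg, m, count, error)
 *	from this same context once the synchronous swap_pager_getpages()
 *	call returns, rather than from a true I/O completion interrupt.)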
 */
static int
swap_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
    int reqpage, pgo_getpages_iodone_t iodone, void *arg)
{
	int r, error;

	r = swap_pager_getpages(object, m, count, reqpage);
	VM_OBJECT_WUNLOCK(object);
	switch (r) {
	case VM_PAGER_OK:
		error = 0;
		break;
	case VM_PAGER_ERROR:
		error = EIO;
		break;
	case VM_PAGER_FAIL:
		error = EINVAL;
		break;
	default:
		panic("unhandled swap_pager_getpages() error %d", r);
	}
	(iodone)(arg, m, count, error);
	VM_OBJECT_WLOCK(object);

	return (r);
}

/*
 *	swap_pager_putpages:
 *
 *	Assign swap (if necessary) and initiate I/O on the specified pages.
 *
 *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
 *	are automatically converted to SWAP objects.
 *
 *	In a low memory situation we may block in VOP_STRATEGY(), but the new
 *	vm_page reservation system coupled with properly written VFS devices
 *	should ensure that no low-memory deadlock occurs.  This is an area
 *	which needs work.
 *
 *	The parent has N vm_object_pip_add() references prior to
 *	calling us and will remove references for rtvals[] that are
 *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
 *	completion.
 *
 *	The parent has soft-busy'd the pages it passes us and will unbusy
 *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
 *	We need to unbusy the rest on I/O completion.
 */
void
swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
    int flags, int *rtvals)
{
	int i, n;
	boolean_t sync;

	if (count && m[0]->object != object) {
		panic("swap_pager_putpages: object mismatch %p/%p",
		    object,
		    m[0]->object
		);
	}

	/*
	 * Step 1
	 *
	 * Turn object into OBJT_SWAP
	 * check for bogus sysops
	 * force sync if not pageout process
	 */
	if (object->type != OBJT_SWAP)
		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
	VM_OBJECT_WUNLOCK(object);

	n = 0;
	if (curproc != pageproc)
		sync = TRUE;
	else
		sync = (flags & VM_PAGER_PUT_SYNC) != 0;

	/*
	 * Step 2
	 *
	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
	 * The page is left dirty until the pageout operation completes
	 * successfully.
	 */
	for (i = 0; i < count; i += n) {
		int j;
		struct buf *bp;
		daddr_t blk;

		/*
		 * Maximum I/O size is limited by a number of factors.
		 */
		n = min(BLIST_MAX_ALLOC, count - i);
		n = min(n, nsw_cluster_max);

		/*
		 * Get biggest block of swap we can.  If we fail, fall
		 * back and try to allocate a smaller block.  Don't go
		 * overboard trying to allocate space if it would overly
		 * fragment swap.
		 */
		while (
		    (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
		    n > 4
		) {
			n >>= 1;
		}
		if (blk == SWAPBLK_NONE) {
			for (j = 0; j < n; ++j)
				rtvals[i+j] = VM_PAGER_FAIL;
			continue;
		}

		/*
		 * All I/O parameters have been satisfied, build the I/O
		 * request and assign the swap space.
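		 *
		 * (The fallback loop above retries at half the size while
		 * n > 4; with MAX_PAGEOUT_CLUSTER at its default of 16 this
		 * means trying 16, 8 and finally 4 pages before giving up.)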
		 */
		if (sync == TRUE) {
			bp = getpbuf(&nsw_wcount_sync);
		} else {
			bp = getpbuf(&nsw_wcount_async);
			bp->b_flags = B_ASYNC;
		}
		bp->b_flags |= B_PAGING;
		bp->b_iocmd = BIO_WRITE;

		bp->b_rcred = crhold(thread0.td_ucred);
		bp->b_wcred = crhold(thread0.td_ucred);
		bp->b_bcount = PAGE_SIZE * n;
		bp->b_bufsize = PAGE_SIZE * n;
		bp->b_blkno = blk;

		VM_OBJECT_WLOCK(object);
		for (j = 0; j < n; ++j) {
			vm_page_t mreq = m[i+j];

			swp_pager_meta_build(
			    mreq->object,
			    mreq->pindex,
			    blk + j
			);
			vm_page_dirty(mreq);
			rtvals[i+j] = VM_PAGER_OK;

			mreq->oflags |= VPO_SWAPINPROG;
			bp->b_pages[j] = mreq;
		}
		VM_OBJECT_WUNLOCK(object);
		bp->b_npages = n;
		/*
		 * Must set dirty range for NFS to work.
		 */
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;

		PCPU_INC(cnt.v_swapout);
		PCPU_ADD(cnt.v_swappgsout, bp->b_npages);

		/*
		 * asynchronous
		 *
		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
		 */
		if (sync == FALSE) {
			bp->b_iodone = swp_pager_async_iodone;
			BUF_KERNPROC(bp);
			swp_pager_strategy(bp);

			for (j = 0; j < n; ++j)
				rtvals[i+j] = VM_PAGER_PEND;
			/* restart outer loop */
			continue;
		}

		/*
		 * synchronous
		 *
		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
		 */
		bp->b_iodone = bdone;
		swp_pager_strategy(bp);

		/*
		 * Wait for the sync I/O to complete, then update rtvals.
		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
		 * our async completion routine at the end, thus avoiding a
		 * double-free.
		 */
		bwait(bp, PVM, "swwrt");
		for (j = 0; j < n; ++j)
			rtvals[i+j] = VM_PAGER_PEND;
		/*
		 * Now that we are through with the bp, we can call the
		 * normal async completion, which frees everything up.
		 */
		swp_pager_async_iodone(bp);
	}
	VM_OBJECT_WLOCK(object);
}

/*
 *	swp_pager_async_iodone:
 *
 *	Completion routine for asynchronous reads and writes from/to swap.
 *	Also called manually by synchronous code to finish up a bp.
 *
 *	This routine may not sleep.
 */
static void
swp_pager_async_iodone(struct buf *bp)
{
	int i;
	vm_object_t object = NULL;

	/*
	 * report error
	 */
	if (bp->b_ioflags & BIO_ERROR) {
		printf(
		    "swap_pager: I/O error - %s failed; blkno %ld,"
		    "size %ld, error %d\n",
		    ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
		    (long)bp->b_blkno,
		    (long)bp->b_bcount,
		    bp->b_error
		);
	}

	/*
	 * remove the mapping for kernel virtual
	 */
	if ((bp->b_flags & B_UNMAPPED) != 0) {
		bp->b_data = bp->b_kvaalloc;
		bp->b_kvabase = bp->b_kvaalloc;
		bp->b_flags &= ~B_UNMAPPED;
	} else
		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);

	if (bp->b_npages) {
		object = bp->b_pages[0]->object;
		VM_OBJECT_WLOCK(object);
	}

	/*
	 * cleanup pages.  If an error occurs writing to swap, we are in
	 * very serious trouble.  If it happens to be a disk error, though,
	 * we may be able to recover by reassigning the swap later on.  So
	 * in this case we remove the m->swapblk assignment for the page
	 * but do not free it in the rlist.  The erroneous block(s) are thus
	 * never reallocated as swap.  Redirty the page and continue.
	 */
	for (i = 0; i < bp->b_npages; ++i) {
		vm_page_t m = bp->b_pages[i];

		m->oflags &= ~VPO_SWAPINPROG;
		if (m->oflags & VPO_SWAPSLEEP) {
			m->oflags &= ~VPO_SWAPSLEEP;
			wakeup(&object->paging_in_progress);
		}

		if (bp->b_ioflags & BIO_ERROR) {
			/*
			 * If an error occurs I'd love to throw the swapblk
			 * away without freeing it back to swapspace, so it
			 * can never be used again.  But I can't from an
			 * interrupt.
			 */
			if (bp->b_iocmd == BIO_READ) {
				/*
				 * When reading, reqpage needs to stay
				 * locked for the parent, but all other
				 * pages can be freed.  We still want to
				 * wakeup the parent waiting on the page,
				 * though.  ( also: pg_reqpage can be -1 and
				 * not match anything ).
				 *
				 * We have to wake specifically requested pages
				 * up too because we cleared VPO_SWAPINPROG and
				 * someone may be waiting for that.
				 *
				 * NOTE: for reads, m->dirty will probably
				 * be overridden by the original caller of
				 * getpages so don't play cute tricks here.
				 */
				m->valid = 0;
				if (i != bp->b_pager.pg_reqpage)
					swp_pager_free_nrpage(m);
				else {
					vm_page_lock(m);
					vm_page_flash(m);
					vm_page_unlock(m);
				}
				/*
				 * If i == bp->b_pager.pg_reqpage, do not wake
				 * the page up.  The caller needs to.
				 */
			} else {
				/*
				 * If a write error occurs, reactivate page
				 * so it doesn't clog the inactive list,
				 * then finish the I/O.
				 */
				vm_page_dirty(m);
				vm_page_lock(m);
				vm_page_activate(m);
				vm_page_unlock(m);
				vm_page_sunbusy(m);
			}
		} else if (bp->b_iocmd == BIO_READ) {
			/*
			 * NOTE: for reads, m->dirty will probably be
			 * overridden by the original caller of getpages so
			 * we cannot set them in order to free the underlying
			 * swap in a low-swap situation.  I don't think we'd
			 * want to do that anyway, but it was an optimization
			 * that existed in the old swapper for a time before
			 * it got ripped out due to precisely this problem.
			 *
			 * If not the requested page then deactivate it.
			 *
			 * Note that the requested page, reqpage, is left
			 * busied, but we still have to wake it up.  The
			 * other pages are released (unbusied) by
			 * vm_page_xunbusy().
			 */
			KASSERT(!pmap_page_is_mapped(m),
			    ("swp_pager_async_iodone: page %p is mapped", m));
			m->valid = VM_PAGE_BITS_ALL;
			KASSERT(m->dirty == 0,
			    ("swp_pager_async_iodone: page %p is dirty", m));

			/*
			 * We have to wake specifically requested pages
			 * up too because we cleared VPO_SWAPINPROG and
			 * could be waiting for it in getpages.  However,
			 * be sure to not unbusy getpages specifically
			 * requested page - getpages expects it to be
			 * left busy.
			 */
			if (i != bp->b_pager.pg_reqpage) {
				vm_page_lock(m);
				vm_page_deactivate(m);
				vm_page_unlock(m);
				vm_page_xunbusy(m);
			} else {
				vm_page_lock(m);
				vm_page_flash(m);
				vm_page_unlock(m);
			}
		} else {
			/*
			 * For write success, clear the dirty
			 * status, then finish the I/O ( which decrements the
			 * busy count and possibly wakes waiters up ).
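			 *
			 * (vm_page_try_to_cache() further below is only
			 * attempted when the free page count is severely
			 * depleted, per vm_page_count_severe().)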
1626 */ 1627 KASSERT(!pmap_page_is_write_mapped(m), 1628 ("swp_pager_async_iodone: page %p is not write" 1629 " protected", m)); 1630 vm_page_undirty(m); 1631 vm_page_sunbusy(m); 1632 if (vm_page_count_severe()) { 1633 vm_page_lock(m); 1634 vm_page_try_to_cache(m); 1635 vm_page_unlock(m); 1636 } 1637 } 1638 } 1639 1640 /* 1641 * adjust pip. NOTE: the original parent may still have its own 1642 * pip refs on the object. 1643 */ 1644 if (object != NULL) { 1645 vm_object_pip_wakeupn(object, bp->b_npages); 1646 VM_OBJECT_WUNLOCK(object); 1647 } 1648 1649 /* 1650 * swapdev_strategy() manually sets b_vp and b_bufobj before calling 1651 * bstrategy(). Set them back to NULL now we're done with it, or we'll 1652 * trigger a KASSERT in relpbuf(). 1653 */ 1654 if (bp->b_vp) { 1655 bp->b_vp = NULL; 1656 bp->b_bufobj = NULL; 1657 } 1658 /* 1659 * release the physical I/O buffer 1660 */ 1661 relpbuf( 1662 bp, 1663 ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : 1664 ((bp->b_flags & B_ASYNC) ? 1665 &nsw_wcount_async : 1666 &nsw_wcount_sync 1667 ) 1668 ) 1669 ); 1670 } 1671 1672 /* 1673 * swap_pager_isswapped: 1674 * 1675 * Return 1 if at least one page in the given object is paged 1676 * out to the given swap device. 1677 * 1678 * This routine may not sleep. 1679 */ 1680 int 1681 swap_pager_isswapped(vm_object_t object, struct swdevt *sp) 1682 { 1683 daddr_t index = 0; 1684 int bcount; 1685 int i; 1686 1687 VM_OBJECT_ASSERT_WLOCKED(object); 1688 if (object->type != OBJT_SWAP) 1689 return (0); 1690 1691 mtx_lock(&swhash_mtx); 1692 for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { 1693 struct swblock *swap; 1694 1695 if ((swap = *swp_pager_hash(object, index)) != NULL) { 1696 for (i = 0; i < SWAP_META_PAGES; ++i) { 1697 if (swp_pager_isondev(swap->swb_pages[i], sp)) { 1698 mtx_unlock(&swhash_mtx); 1699 return (1); 1700 } 1701 } 1702 } 1703 index += SWAP_META_PAGES; 1704 } 1705 mtx_unlock(&swhash_mtx); 1706 return (0); 1707 } 1708 1709 /* 1710 * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in 1711 * 1712 * This routine dissociates the page at the given index within a 1713 * swap block from its backing store, paging it in if necessary. 1714 * If the page is paged in, it is placed in the inactive queue, 1715 * since it had its backing store ripped out from under it. 1716 * We also attempt to swap in all other pages in the swap block, 1717 * we only guarantee that the one at the specified index is 1718 * paged in. 1719 * 1720 * XXX - The code to page the whole block in doesn't work, so we 1721 * revert to the one-by-one behavior for now. Sigh. 1722 */ 1723 static inline void 1724 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) 1725 { 1726 vm_page_t m; 1727 1728 vm_object_pip_add(object, 1); 1729 m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); 1730 if (m->valid == VM_PAGE_BITS_ALL) { 1731 vm_object_pip_wakeup(object); 1732 vm_page_dirty(m); 1733 vm_page_lock(m); 1734 vm_page_activate(m); 1735 vm_page_unlock(m); 1736 vm_page_xunbusy(m); 1737 vm_pager_page_unswapped(m); 1738 return; 1739 } 1740 1741 if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK) 1742 panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ 1743 vm_object_pip_wakeup(object); 1744 vm_page_dirty(m); 1745 vm_page_lock(m); 1746 vm_page_deactivate(m); 1747 vm_page_unlock(m); 1748 vm_page_xunbusy(m); 1749 vm_pager_page_unswapped(m); 1750 } 1751 1752 /* 1753 * swap_pager_swapoff: 1754 * 1755 * Page in all of the pages that have been paged out to the 1756 * given device. 
The corresponding blocks in the bitmap must be 1757 * marked as allocated and the device must be flagged SW_CLOSING. 1758 * There may be no processes swapped out to the device. 1759 * 1760 * This routine may block. 1761 */ 1762 static void 1763 swap_pager_swapoff(struct swdevt *sp) 1764 { 1765 struct swblock *swap; 1766 int i, j, retries; 1767 1768 GIANT_REQUIRED; 1769 1770 retries = 0; 1771 full_rescan: 1772 mtx_lock(&swhash_mtx); 1773 for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ 1774 restart: 1775 for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { 1776 vm_object_t object = swap->swb_object; 1777 vm_pindex_t pindex = swap->swb_index; 1778 for (j = 0; j < SWAP_META_PAGES; ++j) { 1779 if (swp_pager_isondev(swap->swb_pages[j], sp)) { 1780 /* avoid deadlock */ 1781 if (!VM_OBJECT_TRYWLOCK(object)) { 1782 break; 1783 } else { 1784 mtx_unlock(&swhash_mtx); 1785 swp_pager_force_pagein(object, 1786 pindex + j); 1787 VM_OBJECT_WUNLOCK(object); 1788 mtx_lock(&swhash_mtx); 1789 goto restart; 1790 } 1791 } 1792 } 1793 } 1794 } 1795 mtx_unlock(&swhash_mtx); 1796 if (sp->sw_used) { 1797 /* 1798 * Objects may be locked or paging to the device being 1799 * removed, so we will miss their pages and need to 1800 * make another pass. We have marked this device as 1801 * SW_CLOSING, so the activity should finish soon. 1802 */ 1803 retries++; 1804 if (retries > 100) { 1805 panic("swapoff: failed to locate %d swap blocks", 1806 sp->sw_used); 1807 } 1808 pause("swpoff", hz / 20); 1809 goto full_rescan; 1810 } 1811 } 1812 1813 /************************************************************************ 1814 * SWAP META DATA * 1815 ************************************************************************ 1816 * 1817 * These routines manipulate the swap metadata stored in the 1818 * OBJT_SWAP object. 1819 * 1820 * Swap metadata is implemented with a global hash and not directly 1821 * linked into the object. Instead the object simply contains 1822 * appropriate tracking counters. 1823 */ 1824 1825 /* 1826 * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object 1827 * 1828 * We first convert the object to a swap object if it is a default 1829 * object. 1830 * 1831 * The specified swapblk is added to the object's swap metadata. If 1832 * the swapblk is not valid, it is freed instead. Any previously 1833 * assigned swapblk is freed. 1834 */ 1835 static void 1836 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) 1837 { 1838 static volatile int exhausted; 1839 struct swblock *swap; 1840 struct swblock **pswap; 1841 int idx; 1842 1843 VM_OBJECT_ASSERT_WLOCKED(object); 1844 /* 1845 * Convert default object to swap object if necessary 1846 */ 1847 if (object->type != OBJT_SWAP) { 1848 object->type = OBJT_SWAP; 1849 object->un_pager.swp.swp_bcount = 0; 1850 1851 if (object->handle != NULL) { 1852 mtx_lock(&sw_alloc_mtx); 1853 TAILQ_INSERT_TAIL( 1854 NOBJLIST(object->handle), 1855 object, 1856 pager_object_list 1857 ); 1858 mtx_unlock(&sw_alloc_mtx); 1859 } 1860 } 1861 1862 /* 1863 * Locate hash entry. If not found create, but if we aren't adding 1864 * anything just return. If we run out of space in the map we wait 1865 * and, since the hash table may have changed, retry. 1866 */ 1867 retry: 1868 mtx_lock(&swhash_mtx); 1869 pswap = swp_pager_hash(object, pindex); 1870 1871 if ((swap = *pswap) == NULL) { 1872 int i; 1873 1874 if (swapblk == SWAPBLK_NONE) 1875 goto done; 1876 1877 swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT | 1878 (curproc == pageproc ? 
M_USE_RESERVE : 0)); 1879 if (swap == NULL) { 1880 mtx_unlock(&swhash_mtx); 1881 VM_OBJECT_WUNLOCK(object); 1882 if (uma_zone_exhausted(swap_zone)) { 1883 if (atomic_cmpset_int(&exhausted, 0, 1)) 1884 printf("swap zone exhausted, " 1885 "increase kern.maxswzone\n"); 1886 vm_pageout_oom(VM_OOM_SWAPZ); 1887 pause("swzonex", 10); 1888 } else 1889 VM_WAIT; 1890 VM_OBJECT_WLOCK(object); 1891 goto retry; 1892 } 1893 1894 if (atomic_cmpset_int(&exhausted, 1, 0)) 1895 printf("swap zone ok\n"); 1896 1897 swap->swb_hnext = NULL; 1898 swap->swb_object = object; 1899 swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; 1900 swap->swb_count = 0; 1901 1902 ++object->un_pager.swp.swp_bcount; 1903 1904 for (i = 0; i < SWAP_META_PAGES; ++i) 1905 swap->swb_pages[i] = SWAPBLK_NONE; 1906 } 1907 1908 /* 1909 * Delete prior contents of metadata 1910 */ 1911 idx = pindex & SWAP_META_MASK; 1912 1913 if (swap->swb_pages[idx] != SWAPBLK_NONE) { 1914 swp_pager_freeswapspace(swap->swb_pages[idx], 1); 1915 --swap->swb_count; 1916 } 1917 1918 /* 1919 * Enter block into metadata 1920 */ 1921 swap->swb_pages[idx] = swapblk; 1922 if (swapblk != SWAPBLK_NONE) 1923 ++swap->swb_count; 1924 done: 1925 mtx_unlock(&swhash_mtx); 1926 } 1927 1928 /* 1929 * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata 1930 * 1931 * The requested range of blocks is freed, with any associated swap 1932 * returned to the swap bitmap. 1933 * 1934 * This routine will free swap metadata structures as they are cleaned 1935 * out. This routine does *NOT* operate on swap metadata associated 1936 * with resident pages. 1937 */ 1938 static void 1939 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count) 1940 { 1941 1942 VM_OBJECT_ASSERT_LOCKED(object); 1943 if (object->type != OBJT_SWAP) 1944 return; 1945 1946 while (count > 0) { 1947 struct swblock **pswap; 1948 struct swblock *swap; 1949 1950 mtx_lock(&swhash_mtx); 1951 pswap = swp_pager_hash(object, index); 1952 1953 if ((swap = *pswap) != NULL) { 1954 daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; 1955 1956 if (v != SWAPBLK_NONE) { 1957 swp_pager_freeswapspace(v, 1); 1958 swap->swb_pages[index & SWAP_META_MASK] = 1959 SWAPBLK_NONE; 1960 if (--swap->swb_count == 0) { 1961 *pswap = swap->swb_hnext; 1962 uma_zfree(swap_zone, swap); 1963 --object->un_pager.swp.swp_bcount; 1964 } 1965 } 1966 --count; 1967 ++index; 1968 } else { 1969 int n = SWAP_META_PAGES - (index & SWAP_META_MASK); 1970 count -= n; 1971 index += n; 1972 } 1973 mtx_unlock(&swhash_mtx); 1974 } 1975 } 1976 1977 /* 1978 * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object 1979 * 1980 * This routine locates and destroys all swap metadata associated with 1981 * an object. 
1982 */ 1983 static void 1984 swp_pager_meta_free_all(vm_object_t object) 1985 { 1986 daddr_t index = 0; 1987 1988 VM_OBJECT_ASSERT_WLOCKED(object); 1989 if (object->type != OBJT_SWAP) 1990 return; 1991 1992 while (object->un_pager.swp.swp_bcount) { 1993 struct swblock **pswap; 1994 struct swblock *swap; 1995 1996 mtx_lock(&swhash_mtx); 1997 pswap = swp_pager_hash(object, index); 1998 if ((swap = *pswap) != NULL) { 1999 int i; 2000 2001 for (i = 0; i < SWAP_META_PAGES; ++i) { 2002 daddr_t v = swap->swb_pages[i]; 2003 if (v != SWAPBLK_NONE) { 2004 --swap->swb_count; 2005 swp_pager_freeswapspace(v, 1); 2006 } 2007 } 2008 if (swap->swb_count != 0) 2009 panic("swap_pager_meta_free_all: swb_count != 0"); 2010 *pswap = swap->swb_hnext; 2011 uma_zfree(swap_zone, swap); 2012 --object->un_pager.swp.swp_bcount; 2013 } 2014 mtx_unlock(&swhash_mtx); 2015 index += SWAP_META_PAGES; 2016 } 2017 } 2018 2019 /* 2020 * SWP_PAGER_META_CTL() - misc control of swap and vm_page_t meta data. 2021 * 2022 * This routine is capable of looking up, popping, or freeing 2023 * swapblk assignments in the swap meta data or in the vm_page_t. 2024 * The routine typically returns the swapblk being looked up or popped, 2025 * or SWAPBLK_NONE if the block was freed or was 2026 * invalid. This routine will automatically free any invalid 2027 * meta-data swapblks. 2028 * 2029 * It is not possible to store invalid swapblks in the swap meta data 2030 * (other than a literal 'SWAPBLK_NONE'), so we don't bother checking. 2031 * 2032 * When acting on a busy resident page and paging is in progress, we 2033 * have to wait until paging is complete but otherwise can act on the 2034 * busy page. 2035 * 2036 * SWM_FREE remove and free swap block from metadata 2037 * SWM_POP remove from meta data but do not free; pop it out 2038 */ 2039 static daddr_t 2040 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) 2041 { 2042 struct swblock **pswap; 2043 struct swblock *swap; 2044 daddr_t r1; 2045 int idx; 2046 2047 VM_OBJECT_ASSERT_LOCKED(object); 2048 /* 2049 * The meta data only exists if the object is OBJT_SWAP 2050 * and even then might not be allocated yet. 2051 */ 2052 if (object->type != OBJT_SWAP) 2053 return (SWAPBLK_NONE); 2054 2055 r1 = SWAPBLK_NONE; 2056 mtx_lock(&swhash_mtx); 2057 pswap = swp_pager_hash(object, pindex); 2058 2059 if ((swap = *pswap) != NULL) { 2060 idx = pindex & SWAP_META_MASK; 2061 r1 = swap->swb_pages[idx]; 2062 2063 if (r1 != SWAPBLK_NONE) { 2064 if (flags & SWM_FREE) { 2065 swp_pager_freeswapspace(r1, 1); 2066 r1 = SWAPBLK_NONE; 2067 } 2068 if (flags & (SWM_FREE|SWM_POP)) { 2069 swap->swb_pages[idx] = SWAPBLK_NONE; 2070 if (--swap->swb_count == 0) { 2071 *pswap = swap->swb_hnext; 2072 uma_zfree(swap_zone, swap); 2073 --object->un_pager.swp.swp_bcount; 2074 } 2075 } 2076 } 2077 } 2078 mtx_unlock(&swhash_mtx); 2079 return (r1); 2080 } 2081 2082 /* 2083 * System call swapon(name) enables swapping on device name, 2084 * which must be a disk device or a network-filesystem regular file. Return EBUSY 2085 * if already swapping on this device.
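 *
 * Userland usage sketch (illustrative; the device path is an assumed
 * example and the wrapper is documented in swapon(2)):
 *
 *	if (swapon("/dev/ada0p3") != 0)
 *		err(1, "swapon /dev/ada0p3");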
2086 */ 2087 #ifndef _SYS_SYSPROTO_H_ 2088 struct swapon_args { 2089 char *name; 2090 }; 2091 #endif 2092 2093 /* 2094 * MPSAFE 2095 */ 2096 /* ARGSUSED */ 2097 int 2098 sys_swapon(struct thread *td, struct swapon_args *uap) 2099 { 2100 struct vattr attr; 2101 struct vnode *vp; 2102 struct nameidata nd; 2103 int error; 2104 2105 error = priv_check(td, PRIV_SWAPON); 2106 if (error) 2107 return (error); 2108 2109 mtx_lock(&Giant); 2110 while (swdev_syscall_active) 2111 tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0); 2112 swdev_syscall_active = 1; 2113 2114 /* 2115 * Swap metadata may not fit in the KVM if we have physical 2116 * memory of >1GB. 2117 */ 2118 if (swap_zone == NULL) { 2119 error = ENOMEM; 2120 goto done; 2121 } 2122 2123 NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, 2124 uap->name, td); 2125 error = namei(&nd); 2126 if (error) 2127 goto done; 2128 2129 NDFREE(&nd, NDF_ONLY_PNBUF); 2130 vp = nd.ni_vp; 2131 2132 if (vn_isdisk(vp, &error)) { 2133 error = swapongeom(td, vp); 2134 } else if (vp->v_type == VREG && 2135 (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && 2136 (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { 2137 /* 2138 * Allow direct swapping to NFS regular files in the same 2139 * way that nfs_mountroot() sets up diskless swapping. 2140 */ 2141 error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); 2142 } 2143 2144 if (error) 2145 vrele(vp); 2146 done: 2147 swdev_syscall_active = 0; 2148 wakeup_one(&swdev_syscall_active); 2149 mtx_unlock(&Giant); 2150 return (error); 2151 } 2152 2153 /* 2154 * Check that the total amount of swap currently configured does not 2155 * exceed half the theoretical maximum. If it does, print a warning 2156 * message and return -1; otherwise, return 0. 2157 */ 2158 static int 2159 swapon_check_swzone(unsigned long npages) 2160 { 2161 unsigned long maxpages; 2162 2163 /* absolute maximum we can handle assuming 100% efficiency */ 2164 maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES; 2165 2166 /* recommend using no more than half that amount */ 2167 if (npages > maxpages / 2) { 2168 printf("warning: total configured swap (%lu pages) " 2169 "exceeds maximum recommended amount (%lu pages).\n", 2170 npages, maxpages / 2); 2171 printf("warning: increase kern.maxswzone " 2172 "or reduce amount of swap.\n"); 2173 return (-1); 2174 } 2175 return (0); 2176 } 2177 2178 static void 2179 swaponsomething(struct vnode *vp, void *id, u_long nblks, 2180 sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) 2181 { 2182 struct swdevt *sp, *tsp; 2183 swblk_t dvbase; 2184 u_long mblocks; 2185 2186 /* 2187 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. 2188 * First chop nblks off to page-align it, then convert. 2189 * 2190 * sw->sw_nblks is in page-sized chunks now too. 2191 */ 2192 nblks &= ~(ctodb(1) - 1); 2193 nblks = dbtoc(nblks); 2194 2195 /* 2196 * If we go beyond this, we get overflows in the radix 2197 * tree bitmap code. 
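 *
 * Worked example (assuming PAGE_SIZE == 4096 and DEV_BSIZE == 512, so
 * ctodb(1) == 8): a 1 GB partition arrives as nblks == 2097152 512-byte
 * blocks; the mask ~(ctodb(1) - 1) keeps nblks a multiple of 8 and
 * dbtoc() reduces it to 262144 page-sized swap blocks, well under the
 * per-device ceiling computed below.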
2198 */ 2199 mblocks = 0x40000000 / BLIST_META_RADIX; 2200 if (nblks > mblocks) { 2201 printf( 2202 "WARNING: reducing swap size to maximum of %luMB per unit\n", 2203 mblocks / 1024 / 1024 * PAGE_SIZE); 2204 nblks = mblocks; 2205 } 2206 2207 sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); 2208 sp->sw_vp = vp; 2209 sp->sw_id = id; 2210 sp->sw_dev = dev; 2211 sp->sw_flags = 0; 2212 sp->sw_nblks = nblks; 2213 sp->sw_used = 0; 2214 sp->sw_strategy = strategy; 2215 sp->sw_close = close; 2216 sp->sw_flags = flags; 2217 2218 sp->sw_blist = blist_create(nblks, M_WAITOK); 2219 /* 2220 * Do not free the first two blocks in order to avoid overwriting 2221 * any BSD label at the front of the partition. 2222 */ 2223 blist_free(sp->sw_blist, 2, nblks - 2); 2224 2225 dvbase = 0; 2226 mtx_lock(&sw_dev_mtx); 2227 TAILQ_FOREACH(tsp, &swtailq, sw_list) { 2228 if (tsp->sw_end >= dvbase) { 2229 /* 2230 * We put one uncovered page between the devices 2231 * in order to definitively prevent any cross-device 2232 * I/O requests. 2233 */ 2234 dvbase = tsp->sw_end + 1; 2235 } 2236 } 2237 sp->sw_first = dvbase; 2238 sp->sw_end = dvbase + nblks; 2239 TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); 2240 nswapdev++; 2241 swap_pager_avail += nblks; 2242 swap_total += (vm_ooffset_t)nblks * PAGE_SIZE; 2243 swapon_check_swzone(swap_total / PAGE_SIZE); 2244 swp_sizecheck(); 2245 mtx_unlock(&sw_dev_mtx); 2246 } 2247 2248 /* 2249 * SYSCALL: swapoff(devname) 2250 * 2251 * Disable swapping on the given device. 2252 * 2253 * XXX: Badly designed system call: it should use a device index 2254 * rather than filename as specification. We keep sw_vp around 2255 * only to make this work. 2256 */ 2257 #ifndef _SYS_SYSPROTO_H_ 2258 struct swapoff_args { 2259 char *name; 2260 }; 2261 #endif 2262 2263 /* 2264 * MPSAFE 2265 */ 2266 /* ARGSUSED */ 2267 int 2268 sys_swapoff(struct thread *td, struct swapoff_args *uap) 2269 { 2270 struct vnode *vp; 2271 struct nameidata nd; 2272 struct swdevt *sp; 2273 int error; 2274 2275 error = priv_check(td, PRIV_SWAPOFF); 2276 if (error) 2277 return (error); 2278 2279 mtx_lock(&Giant); 2280 while (swdev_syscall_active) 2281 tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); 2282 swdev_syscall_active = 1; 2283 2284 NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, 2285 td); 2286 error = namei(&nd); 2287 if (error) 2288 goto done; 2289 NDFREE(&nd, NDF_ONLY_PNBUF); 2290 vp = nd.ni_vp; 2291 2292 mtx_lock(&sw_dev_mtx); 2293 TAILQ_FOREACH(sp, &swtailq, sw_list) { 2294 if (sp->sw_vp == vp) 2295 break; 2296 } 2297 mtx_unlock(&sw_dev_mtx); 2298 if (sp == NULL) { 2299 error = EINVAL; 2300 goto done; 2301 } 2302 error = swapoff_one(sp, td->td_ucred); 2303 done: 2304 swdev_syscall_active = 0; 2305 wakeup_one(&swdev_syscall_active); 2306 mtx_unlock(&Giant); 2307 return (error); 2308 } 2309 2310 static int 2311 swapoff_one(struct swdevt *sp, struct ucred *cred) 2312 { 2313 u_long nblks, dvbase; 2314 #ifdef MAC 2315 int error; 2316 #endif 2317 2318 mtx_assert(&Giant, MA_OWNED); 2319 #ifdef MAC 2320 (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); 2321 error = mac_system_check_swapoff(cred, sp->sw_vp); 2322 (void) VOP_UNLOCK(sp->sw_vp, 0); 2323 if (error != 0) 2324 return (error); 2325 #endif 2326 nblks = sp->sw_nblks; 2327 2328 /* 2329 * We can turn off this swap device safely only if the 2330 * available virtual memory in the system will fit the amount 2331 * of data we will have to page back in, plus an epsilon so 2332 * the system doesn't become critically low on swap space.
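 *
 * Expressed as the inequality tested below (all quantities in pages):
 *
 *	v_free_count + v_cache_count + swap_pager_avail >=
 *	    nblks + nswap_lowat
 *
 * where nblks is the size of the device being removed.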
2333 */ 2334 if (vm_cnt.v_free_count + vm_cnt.v_cache_count + swap_pager_avail < 2335 nblks + nswap_lowat) { 2336 return (ENOMEM); 2337 } 2338 2339 /* 2340 * Prevent further allocations on this device. 2341 */ 2342 mtx_lock(&sw_dev_mtx); 2343 sp->sw_flags |= SW_CLOSING; 2344 for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) { 2345 swap_pager_avail -= blist_fill(sp->sw_blist, 2346 dvbase, dmmax); 2347 } 2348 swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE; 2349 mtx_unlock(&sw_dev_mtx); 2350 2351 /* 2352 * Page in the contents of the device and close it. 2353 */ 2354 swap_pager_swapoff(sp); 2355 2356 sp->sw_close(curthread, sp); 2357 sp->sw_id = NULL; 2358 mtx_lock(&sw_dev_mtx); 2359 TAILQ_REMOVE(&swtailq, sp, sw_list); 2360 nswapdev--; 2361 if (nswapdev == 0) { 2362 swap_pager_full = 2; 2363 swap_pager_almost_full = 1; 2364 } 2365 if (swdevhd == sp) 2366 swdevhd = NULL; 2367 mtx_unlock(&sw_dev_mtx); 2368 blist_destroy(sp->sw_blist); 2369 free(sp, M_VMPGDATA); 2370 return (0); 2371 } 2372 2373 void 2374 swapoff_all(void) 2375 { 2376 struct swdevt *sp, *spt; 2377 const char *devname; 2378 int error; 2379 2380 mtx_lock(&Giant); 2381 while (swdev_syscall_active) 2382 tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); 2383 swdev_syscall_active = 1; 2384 2385 mtx_lock(&sw_dev_mtx); 2386 TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { 2387 mtx_unlock(&sw_dev_mtx); 2388 if (vn_isdisk(sp->sw_vp, NULL)) 2389 devname = devtoname(sp->sw_vp->v_rdev); 2390 else 2391 devname = "[file]"; 2392 error = swapoff_one(sp, thread0.td_ucred); 2393 if (error != 0) { 2394 printf("Cannot remove swap device %s (error=%d), " 2395 "skipping.\n", devname, error); 2396 } else if (bootverbose) { 2397 printf("Swap device %s removed.\n", devname); 2398 } 2399 mtx_lock(&sw_dev_mtx); 2400 } 2401 mtx_unlock(&sw_dev_mtx); 2402 2403 swdev_syscall_active = 0; 2404 wakeup_one(&swdev_syscall_active); 2405 mtx_unlock(&Giant); 2406 } 2407 2408 void 2409 swap_pager_status(int *total, int *used) 2410 { 2411 struct swdevt *sp; 2412 2413 *total = 0; 2414 *used = 0; 2415 mtx_lock(&sw_dev_mtx); 2416 TAILQ_FOREACH(sp, &swtailq, sw_list) { 2417 *total += sp->sw_nblks; 2418 *used += sp->sw_used; 2419 } 2420 mtx_unlock(&sw_dev_mtx); 2421 } 2422 2423 int 2424 swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) 2425 { 2426 struct swdevt *sp; 2427 const char *tmp_devname; 2428 int error, n; 2429 2430 n = 0; 2431 error = ENOENT; 2432 mtx_lock(&sw_dev_mtx); 2433 TAILQ_FOREACH(sp, &swtailq, sw_list) { 2434 if (n != name) { 2435 n++; 2436 continue; 2437 } 2438 xs->xsw_version = XSWDEV_VERSION; 2439 xs->xsw_dev = sp->sw_dev; 2440 xs->xsw_flags = sp->sw_flags; 2441 xs->xsw_nblks = sp->sw_nblks; 2442 xs->xsw_used = sp->sw_used; 2443 if (devname != NULL) { 2444 if (vn_isdisk(sp->sw_vp, NULL)) 2445 tmp_devname = devtoname(sp->sw_vp->v_rdev); 2446 else 2447 tmp_devname = "[file]"; 2448 strncpy(devname, tmp_devname, len); 2449 } 2450 error = 0; 2451 break; 2452 } 2453 mtx_unlock(&sw_dev_mtx); 2454 return (error); 2455 } 2456 2457 static int 2458 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) 2459 { 2460 struct xswdev xs; 2461 int error; 2462 2463 if (arg2 != 1) /* name length */ 2464 return (EINVAL); 2465 error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); 2466 if (error != 0) 2467 return (error); 2468 error = SYSCTL_OUT(req, &xs, sizeof(xs)); 2469 return (error); 2470 } 2471 2472 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, 2473 "Number of swap devices"); 2474 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, 
sysctl_vm_swap_info, 2475 "Swap statistics by device"); 2476 2477 /* 2478 * vmspace_swap_count() - count the approximate swap usage in pages for a 2479 * vmspace. 2480 * 2481 * The map must be locked. 2482 * 2483 * Swap usage is determined by taking the proportional swap used by 2484 * VM objects backing the VM map. To make up for fractional losses, 2485 * if the VM object has any swap use at all the associated map entries 2486 * count for at least 1 swap page. 2487 */ 2488 long 2489 vmspace_swap_count(struct vmspace *vmspace) 2490 { 2491 vm_map_t map; 2492 vm_map_entry_t cur; 2493 vm_object_t object; 2494 long count, n; 2495 2496 map = &vmspace->vm_map; 2497 count = 0; 2498 2499 for (cur = map->header.next; cur != &map->header; cur = cur->next) { 2500 if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && 2501 (object = cur->object.vm_object) != NULL) { 2502 VM_OBJECT_WLOCK(object); 2503 if (object->type == OBJT_SWAP && 2504 object->un_pager.swp.swp_bcount != 0) { 2505 n = (cur->end - cur->start) / PAGE_SIZE; 2506 count += object->un_pager.swp.swp_bcount * 2507 SWAP_META_PAGES * n / object->size + 1; 2508 } 2509 VM_OBJECT_WUNLOCK(object); 2510 } 2511 } 2512 return (count); 2513 } 2514 2515 /* 2516 * GEOM backend 2517 * 2518 * Swapping onto disk devices. 2519 * 2520 */ 2521 2522 static g_orphan_t swapgeom_orphan; 2523 2524 static struct g_class g_swap_class = { 2525 .name = "SWAP", 2526 .version = G_VERSION, 2527 .orphan = swapgeom_orphan, 2528 }; 2529 2530 DECLARE_GEOM_CLASS(g_swap_class, g_class); 2531 2532 2533 static void 2534 swapgeom_close_ev(void *arg, int flags) 2535 { 2536 struct g_consumer *cp; 2537 2538 cp = arg; 2539 g_access(cp, -1, -1, 0); 2540 g_detach(cp); 2541 g_destroy_consumer(cp); 2542 } 2543 2544 static void 2545 swapgeom_done(struct bio *bp2) 2546 { 2547 struct swdevt *sp; 2548 struct buf *bp; 2549 struct g_consumer *cp; 2550 2551 bp = bp2->bio_caller2; 2552 cp = bp2->bio_from; 2553 bp->b_ioflags = bp2->bio_flags; 2554 if (bp2->bio_error) 2555 bp->b_ioflags |= BIO_ERROR; 2556 bp->b_resid = bp->b_bcount - bp2->bio_completed; 2557 bp->b_error = bp2->bio_error; 2558 bufdone(bp); 2559 mtx_lock(&sw_dev_mtx); 2560 if ((--cp->index) == 0 && cp->private) { 2561 if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) { 2562 sp = bp2->bio_caller1; 2563 sp->sw_id = NULL; 2564 } 2565 } 2566 mtx_unlock(&sw_dev_mtx); 2567 g_destroy_bio(bp2); 2568 } 2569 2570 static void 2571 swapgeom_strategy(struct buf *bp, struct swdevt *sp) 2572 { 2573 struct bio *bio; 2574 struct g_consumer *cp; 2575 2576 mtx_lock(&sw_dev_mtx); 2577 cp = sp->sw_id; 2578 if (cp == NULL) { 2579 mtx_unlock(&sw_dev_mtx); 2580 bp->b_error = ENXIO; 2581 bp->b_ioflags |= BIO_ERROR; 2582 bufdone(bp); 2583 return; 2584 } 2585 cp->index++; 2586 mtx_unlock(&sw_dev_mtx); 2587 if (bp->b_iocmd == BIO_WRITE) 2588 bio = g_new_bio(); 2589 else 2590 bio = g_alloc_bio(); 2591 if (bio == NULL) { 2592 bp->b_error = ENOMEM; 2593 bp->b_ioflags |= BIO_ERROR; 2594 bufdone(bp); 2595 return; 2596 } 2597 2598 bio->bio_caller1 = sp; 2599 bio->bio_caller2 = bp; 2600 bio->bio_cmd = bp->b_iocmd; 2601 bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; 2602 bio->bio_length = bp->b_bcount; 2603 bio->bio_done = swapgeom_done; 2604 if ((bp->b_flags & B_UNMAPPED) != 0) { 2605 bio->bio_ma = bp->b_pages; 2606 bio->bio_data = unmapped_buf; 2607 bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; 2608 bio->bio_ma_n = bp->b_npages; 2609 bio->bio_flags |= BIO_UNMAPPED; 2610 } else { 2611 bio->bio_data = bp->b_data; 2612 bio->bio_ma = 
NULL; 2613 } 2614 g_io_request(bio, cp); 2615 return; 2616 } 2617 2618 static void 2619 swapgeom_orphan(struct g_consumer *cp) 2620 { 2621 struct swdevt *sp; 2622 int destroy; 2623 2624 mtx_lock(&sw_dev_mtx); 2625 TAILQ_FOREACH(sp, &swtailq, sw_list) { 2626 if (sp->sw_id == cp) { 2627 sp->sw_flags |= SW_CLOSING; 2628 break; 2629 } 2630 } 2631 cp->private = (void *)(uintptr_t)1; 2632 destroy = ((sp != NULL) && (cp->index == 0)); 2633 if (destroy) 2634 sp->sw_id = NULL; 2635 mtx_unlock(&sw_dev_mtx); 2636 if (destroy) 2637 swapgeom_close_ev(cp, 0); 2638 } 2639 2640 static void 2641 swapgeom_close(struct thread *td, struct swdevt *sw) 2642 { 2643 struct g_consumer *cp; 2644 2645 mtx_lock(&sw_dev_mtx); 2646 cp = sw->sw_id; 2647 sw->sw_id = NULL; 2648 mtx_unlock(&sw_dev_mtx); 2649 /* XXX: direct call when Giant untangled */ 2650 if (cp != NULL) 2651 g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); 2652 } 2653 2654 2655 struct swh0h0 { 2656 struct cdev *dev; 2657 struct vnode *vp; 2658 int error; 2659 }; 2660 2661 static void 2662 swapongeom_ev(void *arg, int flags) 2663 { 2664 struct swh0h0 *swh; 2665 struct g_provider *pp; 2666 struct g_consumer *cp; 2667 static struct g_geom *gp; 2668 struct swdevt *sp; 2669 u_long nblks; 2670 int error; 2671 2672 swh = arg; 2673 swh->error = 0; 2674 pp = g_dev_getprovider(swh->dev); 2675 if (pp == NULL) { 2676 swh->error = ENODEV; 2677 return; 2678 } 2679 mtx_lock(&sw_dev_mtx); 2680 TAILQ_FOREACH(sp, &swtailq, sw_list) { 2681 cp = sp->sw_id; 2682 if (cp != NULL && cp->provider == pp) { 2683 mtx_unlock(&sw_dev_mtx); 2684 swh->error = EBUSY; 2685 return; 2686 } 2687 } 2688 mtx_unlock(&sw_dev_mtx); 2689 if (gp == NULL) 2690 gp = g_new_geomf(&g_swap_class, "swap"); 2691 cp = g_new_consumer(gp); 2692 cp->index = 0; /* Number of active I/Os. */ 2693 cp->private = NULL; /* Orphanization flag */ 2694 g_attach(cp, pp); 2695 /* 2696 * XXX: Every time you think you can improve the margin for 2697 * footshooting, somebody depends on the ability to do so: 2698 * savecore(8) wants to write to our swapdev so we cannot 2699 * set an exclusive count :-( 2700 */ 2701 error = g_access(cp, 1, 1, 0); 2702 if (error) { 2703 g_detach(cp); 2704 g_destroy_consumer(cp); 2705 swh->error = error; 2706 return; 2707 } 2708 nblks = pp->mediasize / DEV_BSIZE; 2709 swaponsomething(swh->vp, cp, nblks, swapgeom_strategy, 2710 swapgeom_close, dev2udev(swh->dev), 2711 (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); 2712 swh->error = 0; 2713 } 2714 2715 static int 2716 swapongeom(struct thread *td, struct vnode *vp) 2717 { 2718 int error; 2719 struct swh0h0 swh; 2720 2721 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2722 2723 swh.dev = vp->v_rdev; 2724 swh.vp = vp; 2725 swh.error = 0; 2726 /* XXX: direct call when Giant untangled */ 2727 error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL); 2728 if (!error) 2729 error = swh.error; 2730 VOP_UNLOCK(vp, 0); 2731 return (error); 2732 } 2733 2734 /* 2735 * VNODE backend 2736 * 2737 * This is used mainly for network filesystem (read: probably only tested 2738 * with NFS) swapfiles.
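 *
 * The strategy routine below remaps the pager's page-sized, device-relative
 * block numbers onto the vnode: for example, assuming PAGE_SIZE == 4096 and
 * DEV_BSIZE == 512, page block sw_first + 10 becomes disk block
 * ctodb(10) == 80 and byte offset dbtob(80) == 40960.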
2739 * 2740 */ 2741 2742 static void 2743 swapdev_strategy(struct buf *bp, struct swdevt *sp) 2744 { 2745 struct vnode *vp2; 2746 2747 bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); 2748 2749 vp2 = sp->sw_id; 2750 vhold(vp2); 2751 if (bp->b_iocmd == BIO_WRITE) { 2752 if (bp->b_bufobj) 2753 bufobj_wdrop(bp->b_bufobj); 2754 bufobj_wref(&vp2->v_bufobj); 2755 } 2756 if (bp->b_bufobj != &vp2->v_bufobj) 2757 bp->b_bufobj = &vp2->v_bufobj; 2758 bp->b_vp = vp2; 2759 bp->b_iooffset = dbtob(bp->b_blkno); 2760 bstrategy(bp); 2761 return; 2762 } 2763 2764 static void 2765 swapdev_close(struct thread *td, struct swdevt *sp) 2766 { 2767 2768 VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); 2769 vrele(sp->sw_vp); 2770 } 2771 2772 2773 static int 2774 swaponvp(struct thread *td, struct vnode *vp, u_long nblks) 2775 { 2776 struct swdevt *sp; 2777 int error; 2778 2779 if (nblks == 0) 2780 return (ENXIO); 2781 mtx_lock(&sw_dev_mtx); 2782 TAILQ_FOREACH(sp, &swtailq, sw_list) { 2783 if (sp->sw_id == vp) { 2784 mtx_unlock(&sw_dev_mtx); 2785 return (EBUSY); 2786 } 2787 } 2788 mtx_unlock(&sw_dev_mtx); 2789 2790 (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2791 #ifdef MAC 2792 error = mac_system_check_swapon(td->td_ucred, vp); 2793 if (error == 0) 2794 #endif 2795 error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); 2796 (void) VOP_UNLOCK(vp, 0); 2797 if (error) 2798 return (error); 2799 2800 swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, 2801 NODEV, 0); 2802 return (0); 2803 } 2804 2805 static int 2806 sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) 2807 { 2808 int error, new, n; 2809 2810 new = nsw_wcount_async_max; 2811 error = sysctl_handle_int(oidp, &new, 0, req); 2812 if (error != 0 || req->newptr == NULL) 2813 return (error); 2814 2815 if (new > nswbuf / 2 || new < 1) 2816 return (EINVAL); 2817 2818 mtx_lock(&pbuf_mtx); 2819 while (nsw_wcount_async_max != new) { 2820 /* 2821 * Adjust difference. If the current async count is too low, 2822 * we will need to squeeze our update slowly in. Sleep with a 2823 * higher priority than getpbuf() to finish faster. 2824 */ 2825 n = new - nsw_wcount_async_max; 2826 if (nsw_wcount_async + n >= 0) { 2827 nsw_wcount_async += n; 2828 nsw_wcount_async_max += n; 2829 wakeup(&nsw_wcount_async); 2830 } else { 2831 nsw_wcount_async_max -= nsw_wcount_async; 2832 nsw_wcount_async = 0; 2833 msleep(&nsw_wcount_async, &pbuf_mtx, PSWP, 2834 "swpsysctl", 0); 2835 } 2836 } 2837 mtx_unlock(&pbuf_mtx); 2838 2839 return (0); 2840 } 2841
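
/*
 * The handler above only takes effect once it is attached to a sysctl
 * node; in this file that attachment is expected to sit with the other
 * vm.* tunables, along the lines of the following sketch (illustrative,
 * not a new declaration):
 *
 *	SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW,
 *	    NULL, 0, sysctl_swap_async_max, "I",
 *	    "Maximum running async swap ops");
 *
 * Raising vm.swap_async_max allows more asynchronous pageout I/Os to be
 * in flight at once, bounded above by nswbuf / 2 as enforced in the
 * handler.
 */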