1 /*- 2 * Copyright (c) 1998 Matthew Dillon, 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1990 University of Utah. 5 * Copyright (c) 1982, 1986, 1989, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * the Systems Programming Group of the University of Utah Computer 10 * Science Department. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * New Swap System 41 * Matthew Dillon 42 * 43 * Radix Bitmap 'blists'. 44 * 45 * - The new swapper uses the new radix bitmap code. This should scale 46 * to arbitrarily small or arbitrarily large swap spaces and an almost 47 * arbitrary degree of fragmentation. 48 * 49 * Features: 50 * 51 * - on the fly reallocation of swap during putpages. The new system 52 * does not try to keep previously allocated swap blocks for dirty 53 * pages. 54 * 55 * - on the fly deallocation of swap 56 * 57 * - No more garbage collection required. Unnecessarily allocated swap 58 * blocks only exist for dirty vm_page_t's now and these are already 59 * cycled (in a high-load system) by the pager. We also do on-the-fly 60 * removal of invalidated swap blocks when a page is destroyed 61 * or renamed. 
62 * 63 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ 64 * 65 * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 66 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 67 */ 68 69 #include <sys/cdefs.h> 70 __FBSDID("$FreeBSD$"); 71 72 #include "opt_swap.h" 73 #include "opt_vm.h" 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/conf.h> 78 #include <sys/kernel.h> 79 #include <sys/priv.h> 80 #include <sys/proc.h> 81 #include <sys/bio.h> 82 #include <sys/buf.h> 83 #include <sys/disk.h> 84 #include <sys/fcntl.h> 85 #include <sys/mount.h> 86 #include <sys/namei.h> 87 #include <sys/vnode.h> 88 #include <sys/malloc.h> 89 #include <sys/resource.h> 90 #include <sys/resourcevar.h> 91 #include <sys/sysctl.h> 92 #include <sys/sysproto.h> 93 #include <sys/blist.h> 94 #include <sys/lock.h> 95 #include <sys/sx.h> 96 #include <sys/vmmeter.h> 97 98 #include <security/mac/mac_framework.h> 99 100 #include <vm/vm.h> 101 #include <vm/pmap.h> 102 #include <vm/vm_map.h> 103 #include <vm/vm_kern.h> 104 #include <vm/vm_object.h> 105 #include <vm/vm_page.h> 106 #include <vm/vm_pager.h> 107 #include <vm/vm_pageout.h> 108 #include <vm/vm_param.h> 109 #include <vm/swap_pager.h> 110 #include <vm/vm_extern.h> 111 #include <vm/uma.h> 112 113 #include <geom/geom.h> 114 115 /* 116 * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, or 16 117 * pages per allocation. We recommend you stick with the default of 8. 118 * The 16-page limit is due to the radix code (kern/subr_blist.c). 119 */ 120 #ifndef MAX_PAGEOUT_CLUSTER 121 #define MAX_PAGEOUT_CLUSTER 16 122 #endif 123 124 #if !defined(SWB_NPAGES) 125 #define SWB_NPAGES MAX_PAGEOUT_CLUSTER 126 #endif 127 128 /* 129 * Piecemeal swap metadata structure. Swap is stored in a radix tree. 130 * 131 * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix 132 * is basically 8. Assuming PAGE_SIZE == 4096, one tree level represents 133 * 32K worth of data, two levels represent 256K, three levels represent 134 * 2 MBytes. This is acceptable. 135 * 136 * Overall memory utilization is about the same as the old swap structure. 137 */ 138 #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) 139 #define SWAP_META_PAGES (SWB_NPAGES * 2) 140 #define SWAP_META_MASK (SWAP_META_PAGES - 1) 141 142 struct swblock { 143 struct swblock *swb_hnext; 144 vm_object_t swb_object; 145 vm_pindex_t swb_index; 146 int swb_count; 147 daddr_t swb_pages[SWAP_META_PAGES]; 148 }; 149 150 static struct mtx sw_dev_mtx; 151 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); 152 static struct swdevt *swdevhd; /* Allocate from here next */ 153 static int nswapdev; /* Number of swap devices */ 154 int swap_pager_avail; 155 static int swdev_syscall_active = 0; /* serialize swap(on|off) */ 156 157 static vm_ooffset_t swap_total; 158 SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, 159 "Total amount of available swap storage."); 160 static vm_ooffset_t swap_reserved; 161 SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, 162 "Amount of swap storage needed to back all allocated anonymous memory."); 163 static int overcommit = 0; 164 SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0, 165 "Configure virtual memory overcommit behavior. 
See tuning(7) " 166 "for details."); 167 168 /* bits from overcommit */ 169 #define SWAP_RESERVE_FORCE_ON (1 << 0) 170 #define SWAP_RESERVE_RLIMIT_ON (1 << 1) 171 #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) 172 173 int 174 swap_reserve(vm_ooffset_t incr) 175 { 176 177 return (swap_reserve_by_uid(incr, curthread->td_ucred->cr_ruidinfo)); 178 } 179 180 int 181 swap_reserve_by_uid(vm_ooffset_t incr, struct uidinfo *uip) 182 { 183 vm_ooffset_t r, s; 184 int res, error; 185 static int curfail; 186 static struct timeval lastfail; 187 188 if (incr & PAGE_MASK) 189 panic("swap_reserve: & PAGE_MASK"); 190 191 res = 0; 192 mtx_lock(&sw_dev_mtx); 193 r = swap_reserved + incr; 194 if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { 195 s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count; 196 s *= PAGE_SIZE; 197 } else 198 s = 0; 199 s += swap_total; 200 if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || 201 (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { 202 res = 1; 203 swap_reserved = r; 204 } 205 mtx_unlock(&sw_dev_mtx); 206 207 if (res) { 208 PROC_LOCK(curproc); 209 UIDINFO_VMSIZE_LOCK(uip); 210 if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && 211 uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) && 212 priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) 213 res = 0; 214 else 215 uip->ui_vmsize += incr; 216 UIDINFO_VMSIZE_UNLOCK(uip); 217 PROC_UNLOCK(curproc); 218 if (!res) { 219 mtx_lock(&sw_dev_mtx); 220 swap_reserved -= incr; 221 mtx_unlock(&sw_dev_mtx); 222 } 223 } 224 if (!res && ppsratecheck(&lastfail, &curfail, 1)) { 225 printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", 226 curproc->p_pid, uip->ui_uid, incr); 227 } 228 229 return (res); 230 } 231 232 void 233 swap_reserve_force(vm_ooffset_t incr) 234 { 235 struct uidinfo *uip; 236 237 mtx_lock(&sw_dev_mtx); 238 swap_reserved += incr; 239 mtx_unlock(&sw_dev_mtx); 240 241 uip = curthread->td_ucred->cr_ruidinfo; 242 PROC_LOCK(curproc); 243 UIDINFO_VMSIZE_LOCK(uip); 244 uip->ui_vmsize += incr; 245 UIDINFO_VMSIZE_UNLOCK(uip); 246 PROC_UNLOCK(curproc); 247 } 248 249 void 250 swap_release(vm_ooffset_t decr) 251 { 252 struct uidinfo *uip; 253 254 PROC_LOCK(curproc); 255 uip = curthread->td_ucred->cr_ruidinfo; 256 swap_release_by_uid(decr, uip); 257 PROC_UNLOCK(curproc); 258 } 259 260 void 261 swap_release_by_uid(vm_ooffset_t decr, struct uidinfo *uip) 262 { 263 264 if (decr & PAGE_MASK) 265 panic("swap_release: & PAGE_MASK"); 266 267 mtx_lock(&sw_dev_mtx); 268 if (swap_reserved < decr) 269 panic("swap_reserved < decr"); 270 swap_reserved -= decr; 271 mtx_unlock(&sw_dev_mtx); 272 273 UIDINFO_VMSIZE_LOCK(uip); 274 if (uip->ui_vmsize < decr) 275 printf("negative vmsize for uid = %d\n", uip->ui_uid); 276 uip->ui_vmsize -= decr; 277 UIDINFO_VMSIZE_UNLOCK(uip); 278 } 279 280 static void swapdev_strategy(struct buf *, struct swdevt *sw); 281 282 #define SWM_FREE 0x02 /* free, period */ 283 #define SWM_POP 0x04 /* pop out */ 284 285 int swap_pager_full = 2; /* swap space exhaustion (task killing) */ 286 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ 287 static int nsw_rcount; /* free read buffers */ 288 static int nsw_wcount_sync; /* limit write buffers / synchronous */ 289 static int nsw_wcount_async; /* limit write buffers / asynchronous */ 290 static int nsw_wcount_async_max;/* assigned maximum */ 291 static int nsw_cluster_max; /* maximum VOP I/O allowed */ 292 293 static struct swblock **swhash; 294 static int swhash_mask; 295 static struct mtx swhash_mtx; 296 297 static int 
swap_async_max = 4; /* maximum in-progress async I/O's */ 298 static struct sx sw_alloc_sx; 299 300 301 SYSCTL_INT(_vm, OID_AUTO, swap_async_max, 302 CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops"); 303 304 /* 305 * "named" and "unnamed" anon region objects. Try to reduce the overhead 306 * of searching a named list by hashing it just a little. 307 */ 308 309 #define NOBJLISTS 8 310 311 #define NOBJLIST(handle) \ 312 (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) 313 314 static struct mtx sw_alloc_mtx; /* protect list manipulation */ 315 static struct pagerlst swap_pager_object_list[NOBJLISTS]; 316 static uma_zone_t swap_zone; 317 static struct vm_object swap_zone_obj; 318 319 /* 320 * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure 321 * calls hooked from other parts of the VM system and do not appear here. 322 * (see vm/swap_pager.h). 323 */ 324 static vm_object_t 325 swap_pager_alloc(void *handle, vm_ooffset_t size, 326 vm_prot_t prot, vm_ooffset_t offset, struct ucred *); 327 static void swap_pager_dealloc(vm_object_t object); 328 static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int); 329 static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); 330 static boolean_t 331 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); 332 static void swap_pager_init(void); 333 static void swap_pager_unswapped(vm_page_t); 334 static void swap_pager_swapoff(struct swdevt *sp); 335 336 struct pagerops swappagerops = { 337 .pgo_init = swap_pager_init, /* early system initialization of pager */ 338 .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ 339 .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ 340 .pgo_getpages = swap_pager_getpages, /* pagein */ 341 .pgo_putpages = swap_pager_putpages, /* pageout */ 342 .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ 343 .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ 344 }; 345 346 /* 347 * dmmax is in page-sized chunks with the new swap system. It was 348 * dev-bsized chunks in the old. dmmax is always a power of 2. 349 * 350 * swap_*() routines are externally accessible. swp_*() routines are 351 * internal. 
352 */ 353 static int dmmax; 354 static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ 355 static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ 356 357 SYSCTL_INT(_vm, OID_AUTO, dmmax, 358 CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block"); 359 360 static void swp_sizecheck(void); 361 static void swp_pager_async_iodone(struct buf *bp); 362 static int swapongeom(struct thread *, struct vnode *); 363 static int swaponvp(struct thread *, struct vnode *, u_long); 364 static int swapoff_one(struct swdevt *sp, struct ucred *cred); 365 366 /* 367 * Swap bitmap functions 368 */ 369 static void swp_pager_freeswapspace(daddr_t blk, int npages); 370 static daddr_t swp_pager_getswapspace(int npages); 371 372 /* 373 * Metadata functions 374 */ 375 static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index); 376 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); 377 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t); 378 static void swp_pager_meta_free_all(vm_object_t); 379 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); 380 381 /* 382 * SWP_SIZECHECK() - update swap_pager_full indication 383 * 384 * update the swap_pager_almost_full indication and warn when we are 385 * about to run out of swap space, using lowat/hiwat hysteresis. 386 * 387 * Clear swap_pager_full ( task killing ) indication when lowat is met. 388 * 389 * No restrictions on call 390 * This routine may not block. 391 * This routine must be called at splvm() 392 */ 393 static void 394 swp_sizecheck(void) 395 { 396 397 if (swap_pager_avail < nswap_lowat) { 398 if (swap_pager_almost_full == 0) { 399 printf("swap_pager: out of swap space\n"); 400 swap_pager_almost_full = 1; 401 } 402 } else { 403 swap_pager_full = 0; 404 if (swap_pager_avail > nswap_hiwat) 405 swap_pager_almost_full = 0; 406 } 407 } 408 409 /* 410 * SWP_PAGER_HASH() - hash swap meta data 411 * 412 * This is an helper function which hashes the swapblk given 413 * the object and page index. It returns a pointer to a pointer 414 * to the object, or a pointer to a NULL pointer if it could not 415 * find a swapblk. 416 * 417 * This routine must be called at splvm(). 418 */ 419 static struct swblock ** 420 swp_pager_hash(vm_object_t object, vm_pindex_t index) 421 { 422 struct swblock **pswap; 423 struct swblock *swap; 424 425 index &= ~(vm_pindex_t)SWAP_META_MASK; 426 pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask]; 427 while ((swap = *pswap) != NULL) { 428 if (swap->swb_object == object && 429 swap->swb_index == index 430 ) { 431 break; 432 } 433 pswap = &swap->swb_hnext; 434 } 435 return (pswap); 436 } 437 438 /* 439 * SWAP_PAGER_INIT() - initialize the swap pager! 440 * 441 * Expected to be started from system init. NOTE: This code is run 442 * before much else so be careful what you depend on. Most of the VM 443 * system has yet to be initialized at this point. 444 */ 445 static void 446 swap_pager_init(void) 447 { 448 /* 449 * Initialize object lists 450 */ 451 int i; 452 453 for (i = 0; i < NOBJLISTS; ++i) 454 TAILQ_INIT(&swap_pager_object_list[i]); 455 mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF); 456 mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); 457 458 /* 459 * Device Stripe, in PAGE_SIZE'd blocks 460 */ 461 dmmax = SWB_NPAGES * 2; 462 } 463 464 /* 465 * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process 466 * 467 * Expected to be started from pageout process once, prior to entering 468 * its main loop. 
469 */ 470 void 471 swap_pager_swap_init(void) 472 { 473 int n, n2; 474 475 /* 476 * Number of in-transit swap bp operations. Don't 477 * exhaust the pbufs completely. Make sure we 478 * initialize workable values (0 will work for hysteresis 479 * but it isn't very efficient). 480 * 481 * The nsw_cluster_max is constrained by the bp->b_pages[] 482 * array (MAXPHYS/PAGE_SIZE) and our locally defined 483 * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are 484 * constrained by the swap device interleave stripe size. 485 * 486 * Currently we hardwire nsw_wcount_async to 4. This limit is 487 * designed to prevent other I/O from having high latencies due to 488 * our pageout I/O. The value 4 works well for one or two active swap 489 * devices but is probably a little low if you have more. Even so, 490 * a higher value would probably generate only a limited improvement 491 * with three or four active swap devices since the system does not 492 * typically have to pageout at extreme bandwidths. We will want 493 * at least 2 per swap devices, and 4 is a pretty good value if you 494 * have one NFS swap device due to the command/ack latency over NFS. 495 * So it all works out pretty well. 496 */ 497 nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); 498 499 mtx_lock(&pbuf_mtx); 500 nsw_rcount = (nswbuf + 1) / 2; 501 nsw_wcount_sync = (nswbuf + 3) / 4; 502 nsw_wcount_async = 4; 503 nsw_wcount_async_max = nsw_wcount_async; 504 mtx_unlock(&pbuf_mtx); 505 506 /* 507 * Initialize our zone. Right now I'm just guessing on the number 508 * we need based on the number of pages in the system. Each swblock 509 * can hold 16 pages, so this is probably overkill. This reservation 510 * is typically limited to around 32MB by default. 511 */ 512 n = cnt.v_page_count / 2; 513 if (maxswzone && n > maxswzone / sizeof(struct swblock)) 514 n = maxswzone / sizeof(struct swblock); 515 n2 = n; 516 swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, 517 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); 518 if (swap_zone == NULL) 519 panic("failed to create swap_zone."); 520 do { 521 if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n)) 522 break; 523 /* 524 * if the allocation failed, try a zone two thirds the 525 * size of the previous attempt. 526 */ 527 n -= ((n + 2) / 3); 528 } while (n > 0); 529 if (n2 != n) 530 printf("Swap zone entries reduced from %d to %d.\n", n2, n); 531 n2 = n; 532 533 /* 534 * Initialize our meta-data hash table. The swapper does not need to 535 * be quite as efficient as the VM system, so we do not use an 536 * oversized hash table. 537 * 538 * n: size of hash table, must be power of 2 539 * swhash_mask: hash table index mask 540 */ 541 for (n = 1; n < n2 / 8; n *= 2) 542 ; 543 swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); 544 swhash_mask = n - 1; 545 mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); 546 } 547 548 /* 549 * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate 550 * its metadata structures. 551 * 552 * This routine is called from the mmap and fork code to create a new 553 * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object 554 * and then converting it with swp_pager_meta_build(). 555 * 556 * This routine may block in vm_object_allocate() and create a named 557 * object lookup race, so we must interlock. 
We must also run at 558 * splvm() for the object lookup to handle races with interrupts, but 559 * we do not have to maintain splvm() in between the lookup and the 560 * add because (I believe) it is not possible to attempt to create 561 * a new swap object w/handle when a default object with that handle 562 * already exists. 563 * 564 * MPSAFE 565 */ 566 static vm_object_t 567 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, 568 vm_ooffset_t offset, struct ucred *cred) 569 { 570 vm_object_t object; 571 vm_pindex_t pindex; 572 struct uidinfo *uip; 573 574 uip = NULL; 575 pindex = OFF_TO_IDX(offset + PAGE_MASK + size); 576 if (handle) { 577 mtx_lock(&Giant); 578 /* 579 * Reference existing named region or allocate new one. There 580 * should not be a race here against swp_pager_meta_build() 581 * as called from vm_page_remove() in regards to the lookup 582 * of the handle. 583 */ 584 sx_xlock(&sw_alloc_sx); 585 object = vm_pager_object_lookup(NOBJLIST(handle), handle); 586 if (object == NULL) { 587 if (cred != NULL) { 588 uip = cred->cr_ruidinfo; 589 if (!swap_reserve_by_uid(size, uip)) { 590 sx_xunlock(&sw_alloc_sx); 591 mtx_unlock(&Giant); 592 return (NULL); 593 } 594 uihold(uip); 595 } 596 object = vm_object_allocate(OBJT_DEFAULT, pindex); 597 VM_OBJECT_LOCK(object); 598 object->handle = handle; 599 if (cred != NULL) { 600 object->uip = uip; 601 object->charge = size; 602 } 603 swp_pager_meta_build(object, 0, SWAPBLK_NONE); 604 VM_OBJECT_UNLOCK(object); 605 } 606 sx_xunlock(&sw_alloc_sx); 607 mtx_unlock(&Giant); 608 } else { 609 if (cred != NULL) { 610 uip = cred->cr_ruidinfo; 611 if (!swap_reserve_by_uid(size, uip)) 612 return (NULL); 613 uihold(uip); 614 } 615 object = vm_object_allocate(OBJT_DEFAULT, pindex); 616 VM_OBJECT_LOCK(object); 617 if (cred != NULL) { 618 object->uip = uip; 619 object->charge = size; 620 } 621 swp_pager_meta_build(object, 0, SWAPBLK_NONE); 622 VM_OBJECT_UNLOCK(object); 623 } 624 return (object); 625 } 626 627 /* 628 * SWAP_PAGER_DEALLOC() - remove swap metadata from object 629 * 630 * The swap backing for the object is destroyed. The code is 631 * designed such that we can reinstantiate it later, but this 632 * routine is typically called only when the entire object is 633 * about to be destroyed. 634 * 635 * This routine may block, but no longer does. 636 * 637 * The object must be locked or unreferenceable. 638 */ 639 static void 640 swap_pager_dealloc(vm_object_t object) 641 { 642 643 /* 644 * Remove from list right away so lookups will fail if we block for 645 * pageout completion. 646 */ 647 if (object->handle != NULL) { 648 mtx_lock(&sw_alloc_mtx); 649 TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); 650 mtx_unlock(&sw_alloc_mtx); 651 } 652 653 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 654 vm_object_pip_wait(object, "swpdea"); 655 656 /* 657 * Free all remaining metadata. We only bother to free it from 658 * the swap meta data. We do not attempt to free swapblk's still 659 * associated with vm_page_t's for this object. We do not care 660 * if paging is still in progress on some objects. 661 */ 662 swp_pager_meta_free_all(object); 663 } 664 665 /************************************************************************ 666 * SWAP PAGER BITMAP ROUTINES * 667 ************************************************************************/ 668 669 /* 670 * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space 671 * 672 * Allocate swap for the requested number of pages. 
The starting 673 * swap block number (a page index) is returned or SWAPBLK_NONE 674 * if the allocation failed. 675 * 676 * Also has the side effect of advising that somebody made a mistake 677 * when they configured swap and didn't configure enough. 678 * 679 * Must be called at splvm() to avoid races with bitmap frees from 680 * vm_page_remove() aka swap_pager_page_removed(). 681 * 682 * This routine may not block 683 * This routine must be called at splvm(). 684 * 685 * We allocate in round-robin fashion from the configured devices. 686 */ 687 static daddr_t 688 swp_pager_getswapspace(int npages) 689 { 690 daddr_t blk; 691 struct swdevt *sp; 692 int i; 693 694 blk = SWAPBLK_NONE; 695 mtx_lock(&sw_dev_mtx); 696 sp = swdevhd; 697 for (i = 0; i < nswapdev; i++) { 698 if (sp == NULL) 699 sp = TAILQ_FIRST(&swtailq); 700 if (!(sp->sw_flags & SW_CLOSING)) { 701 blk = blist_alloc(sp->sw_blist, npages); 702 if (blk != SWAPBLK_NONE) { 703 blk += sp->sw_first; 704 sp->sw_used += npages; 705 swap_pager_avail -= npages; 706 swp_sizecheck(); 707 swdevhd = TAILQ_NEXT(sp, sw_list); 708 goto done; 709 } 710 } 711 sp = TAILQ_NEXT(sp, sw_list); 712 } 713 if (swap_pager_full != 2) { 714 printf("swap_pager_getswapspace(%d): failed\n", npages); 715 swap_pager_full = 2; 716 swap_pager_almost_full = 1; 717 } 718 swdevhd = NULL; 719 done: 720 mtx_unlock(&sw_dev_mtx); 721 return (blk); 722 } 723 724 static int 725 swp_pager_isondev(daddr_t blk, struct swdevt *sp) 726 { 727 728 return (blk >= sp->sw_first && blk < sp->sw_end); 729 } 730 731 static void 732 swp_pager_strategy(struct buf *bp) 733 { 734 struct swdevt *sp; 735 736 mtx_lock(&sw_dev_mtx); 737 TAILQ_FOREACH(sp, &swtailq, sw_list) { 738 if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { 739 mtx_unlock(&sw_dev_mtx); 740 sp->sw_strategy(bp, sp); 741 return; 742 } 743 } 744 panic("Swapdev not found"); 745 } 746 747 748 /* 749 * SWP_PAGER_FREESWAPSPACE() - free raw swap space 750 * 751 * This routine returns the specified swap blocks back to the bitmap. 752 * 753 * Note: This routine may not block (it could in the old swap code), 754 * and through the use of the new blist routines it does not block. 755 * 756 * We must be called at splvm() to avoid races with bitmap frees from 757 * vm_page_remove() aka swap_pager_page_removed(). 758 * 759 * This routine may not block 760 * This routine must be called at splvm(). 761 */ 762 static void 763 swp_pager_freeswapspace(daddr_t blk, int npages) 764 { 765 struct swdevt *sp; 766 767 mtx_lock(&sw_dev_mtx); 768 TAILQ_FOREACH(sp, &swtailq, sw_list) { 769 if (blk >= sp->sw_first && blk < sp->sw_end) { 770 sp->sw_used -= npages; 771 /* 772 * If we are attempting to stop swapping on 773 * this device, we don't want to mark any 774 * blocks free lest they be reused. 775 */ 776 if ((sp->sw_flags & SW_CLOSING) == 0) { 777 blist_free(sp->sw_blist, blk - sp->sw_first, 778 npages); 779 swap_pager_avail += npages; 780 swp_sizecheck(); 781 } 782 mtx_unlock(&sw_dev_mtx); 783 return; 784 } 785 } 786 panic("Swapdev not found"); 787 } 788 789 /* 790 * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page 791 * range within an object. 792 * 793 * This is a globally accessible routine. 794 * 795 * This routine removes swapblk assignments from swap metadata. 796 * 797 * The external callers of this routine typically have already destroyed 798 * or renamed vm_page_t's associated with this range in the object so 799 * we should be ok. 800 * 801 * This routine may be called at any spl. 
We up our spl to splvm temporarily 802 * in order to perform the metadata removal. 803 */ 804 void 805 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) 806 { 807 808 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 809 swp_pager_meta_free(object, start, size); 810 } 811 812 /* 813 * SWAP_PAGER_RESERVE() - reserve swap blocks in object 814 * 815 * Assigns swap blocks to the specified range within the object. The 816 * swap blocks are not zerod. Any previous swap assignment is destroyed. 817 * 818 * Returns 0 on success, -1 on failure. 819 */ 820 int 821 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) 822 { 823 int n = 0; 824 daddr_t blk = SWAPBLK_NONE; 825 vm_pindex_t beg = start; /* save start index */ 826 827 VM_OBJECT_LOCK(object); 828 while (size) { 829 if (n == 0) { 830 n = BLIST_MAX_ALLOC; 831 while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) { 832 n >>= 1; 833 if (n == 0) { 834 swp_pager_meta_free(object, beg, start - beg); 835 VM_OBJECT_UNLOCK(object); 836 return (-1); 837 } 838 } 839 } 840 swp_pager_meta_build(object, start, blk); 841 --size; 842 ++start; 843 ++blk; 844 --n; 845 } 846 swp_pager_meta_free(object, start, n); 847 VM_OBJECT_UNLOCK(object); 848 return (0); 849 } 850 851 /* 852 * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager 853 * and destroy the source. 854 * 855 * Copy any valid swapblks from the source to the destination. In 856 * cases where both the source and destination have a valid swapblk, 857 * we keep the destination's. 858 * 859 * This routine is allowed to block. It may block allocating metadata 860 * indirectly through swp_pager_meta_build() or if paging is still in 861 * progress on the source. 862 * 863 * This routine can be called at any spl 864 * 865 * XXX vm_page_collapse() kinda expects us not to block because we 866 * supposedly do not need to allocate memory, but for the moment we 867 * *may* have to get a little memory from the zone allocator, but 868 * it is taken from the interrupt memory. We should be ok. 869 * 870 * The source object contains no vm_page_t's (which is just as well) 871 * 872 * The source object is of type OBJT_SWAP. 873 * 874 * The source and destination objects must be locked or 875 * inaccessible (XXX are they ?) 876 */ 877 void 878 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, 879 vm_pindex_t offset, int destroysource) 880 { 881 vm_pindex_t i; 882 883 VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED); 884 VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED); 885 886 /* 887 * If destroysource is set, we remove the source object from the 888 * swap_pager internal queue now. 889 */ 890 if (destroysource) { 891 if (srcobject->handle != NULL) { 892 mtx_lock(&sw_alloc_mtx); 893 TAILQ_REMOVE( 894 NOBJLIST(srcobject->handle), 895 srcobject, 896 pager_object_list 897 ); 898 mtx_unlock(&sw_alloc_mtx); 899 } 900 } 901 902 /* 903 * transfer source to destination. 904 */ 905 for (i = 0; i < dstobject->size; ++i) { 906 daddr_t dstaddr; 907 908 /* 909 * Locate (without changing) the swapblk on the destination, 910 * unless it is invalid in which case free it silently, or 911 * if the destination is a resident page, in which case the 912 * source is thrown away. 913 */ 914 dstaddr = swp_pager_meta_ctl(dstobject, i, 0); 915 916 if (dstaddr == SWAPBLK_NONE) { 917 /* 918 * Destination has no swapblk and is not resident, 919 * copy source. 
920 */ 921 daddr_t srcaddr; 922 923 srcaddr = swp_pager_meta_ctl( 924 srcobject, 925 i + offset, 926 SWM_POP 927 ); 928 929 if (srcaddr != SWAPBLK_NONE) { 930 /* 931 * swp_pager_meta_build() can sleep. 932 */ 933 vm_object_pip_add(srcobject, 1); 934 VM_OBJECT_UNLOCK(srcobject); 935 vm_object_pip_add(dstobject, 1); 936 swp_pager_meta_build(dstobject, i, srcaddr); 937 vm_object_pip_wakeup(dstobject); 938 VM_OBJECT_LOCK(srcobject); 939 vm_object_pip_wakeup(srcobject); 940 } 941 } else { 942 /* 943 * Destination has valid swapblk or it is represented 944 * by a resident page. We destroy the sourceblock. 945 */ 946 947 swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); 948 } 949 } 950 951 /* 952 * Free left over swap blocks in source. 953 * 954 * We have to revert the type to OBJT_DEFAULT so we do not accidently 955 * double-remove the object from the swap queues. 956 */ 957 if (destroysource) { 958 swp_pager_meta_free_all(srcobject); 959 /* 960 * Reverting the type is not necessary, the caller is going 961 * to destroy srcobject directly, but I'm doing it here 962 * for consistency since we've removed the object from its 963 * queues. 964 */ 965 srcobject->type = OBJT_DEFAULT; 966 } 967 } 968 969 /* 970 * SWAP_PAGER_HASPAGE() - determine if we have good backing store for 971 * the requested page. 972 * 973 * We determine whether good backing store exists for the requested 974 * page and return TRUE if it does, FALSE if it doesn't. 975 * 976 * If TRUE, we also try to determine how much valid, contiguous backing 977 * store exists before and after the requested page within a reasonable 978 * distance. We do not try to restrict it to the swap device stripe 979 * (that is handled in getpages/putpages). It probably isn't worth 980 * doing here. 981 */ 982 static boolean_t 983 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) 984 { 985 daddr_t blk0; 986 987 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 988 /* 989 * do we have good backing store at the requested index ? 990 */ 991 blk0 = swp_pager_meta_ctl(object, pindex, 0); 992 993 if (blk0 == SWAPBLK_NONE) { 994 if (before) 995 *before = 0; 996 if (after) 997 *after = 0; 998 return (FALSE); 999 } 1000 1001 /* 1002 * find backwards-looking contiguous good backing store 1003 */ 1004 if (before != NULL) { 1005 int i; 1006 1007 for (i = 1; i < (SWB_NPAGES/2); ++i) { 1008 daddr_t blk; 1009 1010 if (i > pindex) 1011 break; 1012 blk = swp_pager_meta_ctl(object, pindex - i, 0); 1013 if (blk != blk0 - i) 1014 break; 1015 } 1016 *before = (i - 1); 1017 } 1018 1019 /* 1020 * find forward-looking contiguous good backing store 1021 */ 1022 if (after != NULL) { 1023 int i; 1024 1025 for (i = 1; i < (SWB_NPAGES/2); ++i) { 1026 daddr_t blk; 1027 1028 blk = swp_pager_meta_ctl(object, pindex + i, 0); 1029 if (blk != blk0 + i) 1030 break; 1031 } 1032 *after = (i - 1); 1033 } 1034 return (TRUE); 1035 } 1036 1037 /* 1038 * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page 1039 * 1040 * This removes any associated swap backing store, whether valid or 1041 * not, from the page. 1042 * 1043 * This routine is typically called when a page is made dirty, at 1044 * which point any associated swap can be freed. MADV_FREE also 1045 * calls us in a special-case situation 1046 * 1047 * NOTE!!! If the page is clean and the swap was valid, the caller 1048 * should make the page dirty before calling this routine. This routine 1049 * does NOT change the m->dirty status of the page. Also: MADV_FREE 1050 * depends on it. 
1051 * 1052 * This routine may not block 1053 * This routine must be called at splvm() 1054 */ 1055 static void 1056 swap_pager_unswapped(vm_page_t m) 1057 { 1058 1059 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 1060 swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); 1061 } 1062 1063 /* 1064 * SWAP_PAGER_GETPAGES() - bring pages in from swap 1065 * 1066 * Attempt to retrieve (m, count) pages from backing store, but make 1067 * sure we retrieve at least m[reqpage]. We try to load in as large 1068 * a chunk surrounding m[reqpage] as is contiguous in swap and which 1069 * belongs to the same object. 1070 * 1071 * The code is designed for asynchronous operation and 1072 * immediate-notification of 'reqpage' but tends not to be 1073 * used that way. Please do not optimize-out this algorithmic 1074 * feature, I intend to improve on it in the future. 1075 * 1076 * The parent has a single vm_object_pip_add() reference prior to 1077 * calling us and we should return with the same. 1078 * 1079 * The parent has BUSY'd the pages. We should return with 'm' 1080 * left busy, but the others adjusted. 1081 */ 1082 static int 1083 swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) 1084 { 1085 struct buf *bp; 1086 vm_page_t mreq; 1087 int i; 1088 int j; 1089 daddr_t blk; 1090 1091 mreq = m[reqpage]; 1092 1093 KASSERT(mreq->object == object, 1094 ("swap_pager_getpages: object mismatch %p/%p", 1095 object, mreq->object)); 1096 1097 /* 1098 * Calculate range to retrieve. The pages have already been assigned 1099 * their swapblks. We require a *contiguous* range but we know it to 1100 * not span devices. If we do not supply it, bad things 1101 * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the 1102 * loops are set up such that the case(s) are handled implicitly. 1103 * 1104 * The swp_*() calls must be made with the object locked. 1105 */ 1106 blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); 1107 1108 for (i = reqpage - 1; i >= 0; --i) { 1109 daddr_t iblk; 1110 1111 iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); 1112 if (blk != iblk + (reqpage - i)) 1113 break; 1114 } 1115 ++i; 1116 1117 for (j = reqpage + 1; j < count; ++j) { 1118 daddr_t jblk; 1119 1120 jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); 1121 if (blk != jblk - (j - reqpage)) 1122 break; 1123 } 1124 1125 /* 1126 * free pages outside our collection range. Note: we never free 1127 * mreq, it must remain busy throughout. 1128 */ 1129 if (0 < i || j < count) { 1130 int k; 1131 1132 vm_page_lock_queues(); 1133 for (k = 0; k < i; ++k) 1134 vm_page_free(m[k]); 1135 for (k = j; k < count; ++k) 1136 vm_page_free(m[k]); 1137 vm_page_unlock_queues(); 1138 } 1139 1140 /* 1141 * Return VM_PAGER_FAIL if we have nothing to do. Return mreq 1142 * still busy, but the others unbusied. 1143 */ 1144 if (blk == SWAPBLK_NONE) 1145 return (VM_PAGER_FAIL); 1146 1147 /* 1148 * Getpbuf() can sleep. 
1149 */ 1150 VM_OBJECT_UNLOCK(object); 1151 /* 1152 * Get a swap buffer header to perform the IO 1153 */ 1154 bp = getpbuf(&nsw_rcount); 1155 bp->b_flags |= B_PAGING; 1156 1157 /* 1158 * map our page(s) into kva for input 1159 */ 1160 pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i); 1161 1162 bp->b_iocmd = BIO_READ; 1163 bp->b_iodone = swp_pager_async_iodone; 1164 bp->b_rcred = crhold(thread0.td_ucred); 1165 bp->b_wcred = crhold(thread0.td_ucred); 1166 bp->b_blkno = blk - (reqpage - i); 1167 bp->b_bcount = PAGE_SIZE * (j - i); 1168 bp->b_bufsize = PAGE_SIZE * (j - i); 1169 bp->b_pager.pg_reqpage = reqpage - i; 1170 1171 VM_OBJECT_LOCK(object); 1172 { 1173 int k; 1174 1175 for (k = i; k < j; ++k) { 1176 bp->b_pages[k - i] = m[k]; 1177 m[k]->oflags |= VPO_SWAPINPROG; 1178 } 1179 } 1180 bp->b_npages = j - i; 1181 1182 PCPU_INC(cnt.v_swapin); 1183 PCPU_ADD(cnt.v_swappgsin, bp->b_npages); 1184 1185 /* 1186 * We still hold the lock on mreq, and our automatic completion routine 1187 * does not remove it. 1188 */ 1189 vm_object_pip_add(object, bp->b_npages); 1190 VM_OBJECT_UNLOCK(object); 1191 1192 /* 1193 * perform the I/O. NOTE!!! bp cannot be considered valid after 1194 * this point because we automatically release it on completion. 1195 * Instead, we look at the one page we are interested in which we 1196 * still hold a lock on even through the I/O completion. 1197 * 1198 * The other pages in our m[] array are also released on completion, 1199 * so we cannot assume they are valid anymore either. 1200 * 1201 * NOTE: b_blkno is destroyed by the call to swapdev_strategy 1202 */ 1203 BUF_KERNPROC(bp); 1204 swp_pager_strategy(bp); 1205 1206 /* 1207 * wait for the page we want to complete. VPO_SWAPINPROG is always 1208 * cleared on completion. If an I/O error occurs, SWAPBLK_NONE 1209 * is set in the meta-data. 1210 */ 1211 VM_OBJECT_LOCK(object); 1212 while ((mreq->oflags & VPO_SWAPINPROG) != 0) { 1213 mreq->oflags |= VPO_WANTED; 1214 PCPU_INC(cnt.v_intrans); 1215 if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) { 1216 printf( 1217 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", 1218 bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); 1219 } 1220 } 1221 1222 /* 1223 * mreq is left busied after completion, but all the other pages 1224 * are freed. If we had an unrecoverable read error the page will 1225 * not be valid. 1226 */ 1227 if (mreq->valid != VM_PAGE_BITS_ALL) { 1228 return (VM_PAGER_ERROR); 1229 } else { 1230 return (VM_PAGER_OK); 1231 } 1232 1233 /* 1234 * A final note: in a low swap situation, we cannot deallocate swap 1235 * and mark a page dirty here because the caller is likely to mark 1236 * the page clean when we return, causing the page to possibly revert 1237 * to all-zero's later. 1238 */ 1239 } 1240 1241 /* 1242 * swap_pager_putpages: 1243 * 1244 * Assign swap (if necessary) and initiate I/O on the specified pages. 1245 * 1246 * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects 1247 * are automatically converted to SWAP objects. 1248 * 1249 * In a low memory situation we may block in VOP_STRATEGY(), but the new 1250 * vm_page reservation system coupled with properly written VFS devices 1251 * should ensure that no low-memory deadlock occurs. This is an area 1252 * which needs work. 1253 * 1254 * The parent has N vm_object_pip_add() references prior to 1255 * calling us and will remove references for rtvals[] that are 1256 * not set to VM_PAGER_PEND. We need to remove the rest on I/O 1257 * completion. 
1258 * 1259 * The parent has soft-busy'd the pages it passes us and will unbusy 1260 * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. 1261 * We need to unbusy the rest on I/O completion. 1262 */ 1263 void 1264 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, 1265 boolean_t sync, int *rtvals) 1266 { 1267 int i; 1268 int n = 0; 1269 1270 if (count && m[0]->object != object) { 1271 panic("swap_pager_putpages: object mismatch %p/%p", 1272 object, 1273 m[0]->object 1274 ); 1275 } 1276 1277 /* 1278 * Step 1 1279 * 1280 * Turn object into OBJT_SWAP 1281 * check for bogus sysops 1282 * force sync if not pageout process 1283 */ 1284 if (object->type != OBJT_SWAP) 1285 swp_pager_meta_build(object, 0, SWAPBLK_NONE); 1286 VM_OBJECT_UNLOCK(object); 1287 1288 if (curproc != pageproc) 1289 sync = TRUE; 1290 1291 /* 1292 * Step 2 1293 * 1294 * Update nsw parameters from swap_async_max sysctl values. 1295 * Do not let the sysop crash the machine with bogus numbers. 1296 */ 1297 mtx_lock(&pbuf_mtx); 1298 if (swap_async_max != nsw_wcount_async_max) { 1299 int n; 1300 1301 /* 1302 * limit range 1303 */ 1304 if ((n = swap_async_max) > nswbuf / 2) 1305 n = nswbuf / 2; 1306 if (n < 1) 1307 n = 1; 1308 swap_async_max = n; 1309 1310 /* 1311 * Adjust difference ( if possible ). If the current async 1312 * count is too low, we may not be able to make the adjustment 1313 * at this time. 1314 */ 1315 n -= nsw_wcount_async_max; 1316 if (nsw_wcount_async + n >= 0) { 1317 nsw_wcount_async += n; 1318 nsw_wcount_async_max += n; 1319 wakeup(&nsw_wcount_async); 1320 } 1321 } 1322 mtx_unlock(&pbuf_mtx); 1323 1324 /* 1325 * Step 3 1326 * 1327 * Assign swap blocks and issue I/O. We reallocate swap on the fly. 1328 * The page is left dirty until the pageout operation completes 1329 * successfully. 1330 */ 1331 for (i = 0; i < count; i += n) { 1332 int j; 1333 struct buf *bp; 1334 daddr_t blk; 1335 1336 /* 1337 * Maximum I/O size is limited by a number of factors. 1338 */ 1339 n = min(BLIST_MAX_ALLOC, count - i); 1340 n = min(n, nsw_cluster_max); 1341 1342 /* 1343 * Get biggest block of swap we can. If we fail, fall 1344 * back and try to allocate a smaller block. Don't go 1345 * overboard trying to allocate space if it would overly 1346 * fragment swap. 1347 */ 1348 while ( 1349 (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && 1350 n > 4 1351 ) { 1352 n >>= 1; 1353 } 1354 if (blk == SWAPBLK_NONE) { 1355 for (j = 0; j < n; ++j) 1356 rtvals[i+j] = VM_PAGER_FAIL; 1357 continue; 1358 } 1359 1360 /* 1361 * All I/O parameters have been satisfied, build the I/O 1362 * request and assign the swap space. 1363 */ 1364 if (sync == TRUE) { 1365 bp = getpbuf(&nsw_wcount_sync); 1366 } else { 1367 bp = getpbuf(&nsw_wcount_async); 1368 bp->b_flags = B_ASYNC; 1369 } 1370 bp->b_flags |= B_PAGING; 1371 bp->b_iocmd = BIO_WRITE; 1372 1373 pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); 1374 1375 bp->b_rcred = crhold(thread0.td_ucred); 1376 bp->b_wcred = crhold(thread0.td_ucred); 1377 bp->b_bcount = PAGE_SIZE * n; 1378 bp->b_bufsize = PAGE_SIZE * n; 1379 bp->b_blkno = blk; 1380 1381 VM_OBJECT_LOCK(object); 1382 for (j = 0; j < n; ++j) { 1383 vm_page_t mreq = m[i+j]; 1384 1385 swp_pager_meta_build( 1386 mreq->object, 1387 mreq->pindex, 1388 blk + j 1389 ); 1390 vm_page_dirty(mreq); 1391 rtvals[i+j] = VM_PAGER_OK; 1392 1393 mreq->oflags |= VPO_SWAPINPROG; 1394 bp->b_pages[j] = mreq; 1395 } 1396 VM_OBJECT_UNLOCK(object); 1397 bp->b_npages = n; 1398 /* 1399 * Must set dirty range for NFS to work. 
1400 */ 1401 bp->b_dirtyoff = 0; 1402 bp->b_dirtyend = bp->b_bcount; 1403 1404 PCPU_INC(cnt.v_swapout); 1405 PCPU_ADD(cnt.v_swappgsout, bp->b_npages); 1406 1407 /* 1408 * asynchronous 1409 * 1410 * NOTE: b_blkno is destroyed by the call to swapdev_strategy 1411 */ 1412 if (sync == FALSE) { 1413 bp->b_iodone = swp_pager_async_iodone; 1414 BUF_KERNPROC(bp); 1415 swp_pager_strategy(bp); 1416 1417 for (j = 0; j < n; ++j) 1418 rtvals[i+j] = VM_PAGER_PEND; 1419 /* restart outter loop */ 1420 continue; 1421 } 1422 1423 /* 1424 * synchronous 1425 * 1426 * NOTE: b_blkno is destroyed by the call to swapdev_strategy 1427 */ 1428 bp->b_iodone = bdone; 1429 swp_pager_strategy(bp); 1430 1431 /* 1432 * Wait for the sync I/O to complete, then update rtvals. 1433 * We just set the rtvals[] to VM_PAGER_PEND so we can call 1434 * our async completion routine at the end, thus avoiding a 1435 * double-free. 1436 */ 1437 bwait(bp, PVM, "swwrt"); 1438 for (j = 0; j < n; ++j) 1439 rtvals[i+j] = VM_PAGER_PEND; 1440 /* 1441 * Now that we are through with the bp, we can call the 1442 * normal async completion, which frees everything up. 1443 */ 1444 swp_pager_async_iodone(bp); 1445 } 1446 VM_OBJECT_LOCK(object); 1447 } 1448 1449 /* 1450 * swp_pager_async_iodone: 1451 * 1452 * Completion routine for asynchronous reads and writes from/to swap. 1453 * Also called manually by synchronous code to finish up a bp. 1454 * 1455 * For READ operations, the pages are PG_BUSY'd. For WRITE operations, 1456 * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY 1457 * unbusy all pages except the 'main' request page. For WRITE 1458 * operations, we vm_page_t->busy'd unbusy all pages ( we can do this 1459 * because we marked them all VM_PAGER_PEND on return from putpages ). 1460 * 1461 * This routine may not block. 1462 */ 1463 static void 1464 swp_pager_async_iodone(struct buf *bp) 1465 { 1466 int i; 1467 vm_object_t object = NULL; 1468 1469 /* 1470 * report error 1471 */ 1472 if (bp->b_ioflags & BIO_ERROR) { 1473 printf( 1474 "swap_pager: I/O error - %s failed; blkno %ld," 1475 "size %ld, error %d\n", 1476 ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), 1477 (long)bp->b_blkno, 1478 (long)bp->b_bcount, 1479 bp->b_error 1480 ); 1481 } 1482 1483 /* 1484 * remove the mapping for kernel virtual 1485 */ 1486 pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); 1487 1488 if (bp->b_npages) { 1489 object = bp->b_pages[0]->object; 1490 VM_OBJECT_LOCK(object); 1491 } 1492 vm_page_lock_queues(); 1493 /* 1494 * cleanup pages. If an error occurs writing to swap, we are in 1495 * very serious trouble. If it happens to be a disk error, though, 1496 * we may be able to recover by reassigning the swap later on. So 1497 * in this case we remove the m->swapblk assignment for the page 1498 * but do not free it in the rlist. The errornous block(s) are thus 1499 * never reallocated as swap. Redirty the page and continue. 1500 */ 1501 for (i = 0; i < bp->b_npages; ++i) { 1502 vm_page_t m = bp->b_pages[i]; 1503 1504 m->oflags &= ~VPO_SWAPINPROG; 1505 1506 if (bp->b_ioflags & BIO_ERROR) { 1507 /* 1508 * If an error occurs I'd love to throw the swapblk 1509 * away without freeing it back to swapspace, so it 1510 * can never be used again. But I can't from an 1511 * interrupt. 1512 */ 1513 if (bp->b_iocmd == BIO_READ) { 1514 /* 1515 * When reading, reqpage needs to stay 1516 * locked for the parent, but all other 1517 * pages can be freed. We still want to 1518 * wakeup the parent waiting on the page, 1519 * though. 
( also: pg_reqpage can be -1 and 1520 * not match anything ). 1521 * 1522 * We have to wake specifically requested pages 1523 * up too because we cleared VPO_SWAPINPROG and 1524 * someone may be waiting for that. 1525 * 1526 * NOTE: for reads, m->dirty will probably 1527 * be overridden by the original caller of 1528 * getpages so don't play cute tricks here. 1529 */ 1530 m->valid = 0; 1531 if (i != bp->b_pager.pg_reqpage) 1532 vm_page_free(m); 1533 else 1534 vm_page_flash(m); 1535 /* 1536 * If i == bp->b_pager.pg_reqpage, do not wake 1537 * the page up. The caller needs to. 1538 */ 1539 } else { 1540 /* 1541 * If a write error occurs, reactivate page 1542 * so it doesn't clog the inactive list, 1543 * then finish the I/O. 1544 */ 1545 vm_page_dirty(m); 1546 vm_page_activate(m); 1547 vm_page_io_finish(m); 1548 } 1549 } else if (bp->b_iocmd == BIO_READ) { 1550 /* 1551 * NOTE: for reads, m->dirty will probably be 1552 * overridden by the original caller of getpages so 1553 * we cannot set them in order to free the underlying 1554 * swap in a low-swap situation. I don't think we'd 1555 * want to do that anyway, but it was an optimization 1556 * that existed in the old swapper for a time before 1557 * it got ripped out due to precisely this problem. 1558 * 1559 * If not the requested page then deactivate it. 1560 * 1561 * Note that the requested page, reqpage, is left 1562 * busied, but we still have to wake it up. The 1563 * other pages are released (unbusied) by 1564 * vm_page_wakeup(). 1565 */ 1566 KASSERT(!pmap_page_is_mapped(m), 1567 ("swp_pager_async_iodone: page %p is mapped", m)); 1568 m->valid = VM_PAGE_BITS_ALL; 1569 KASSERT(m->dirty == 0, 1570 ("swp_pager_async_iodone: page %p is dirty", m)); 1571 1572 /* 1573 * We have to wake specifically requested pages 1574 * up too because we cleared VPO_SWAPINPROG and 1575 * could be waiting for it in getpages. However, 1576 * be sure to not unbusy getpages specifically 1577 * requested page - getpages expects it to be 1578 * left busy. 1579 */ 1580 if (i != bp->b_pager.pg_reqpage) { 1581 vm_page_deactivate(m); 1582 vm_page_wakeup(m); 1583 } else { 1584 vm_page_flash(m); 1585 } 1586 } else { 1587 /* 1588 * For write success, clear the dirty 1589 * status, then finish the I/O ( which decrements the 1590 * busy count and possibly wakes waiter's up ). 1591 */ 1592 KASSERT((m->flags & PG_WRITEABLE) == 0, 1593 ("swp_pager_async_iodone: page %p is not write" 1594 " protected", m)); 1595 vm_page_undirty(m); 1596 vm_page_io_finish(m); 1597 if (vm_page_count_severe()) 1598 vm_page_try_to_cache(m); 1599 } 1600 } 1601 vm_page_unlock_queues(); 1602 1603 /* 1604 * adjust pip. NOTE: the original parent may still have its own 1605 * pip refs on the object. 1606 */ 1607 if (object != NULL) { 1608 vm_object_pip_wakeupn(object, bp->b_npages); 1609 VM_OBJECT_UNLOCK(object); 1610 } 1611 1612 /* 1613 * swapdev_strategy() manually sets b_vp and b_bufobj before calling 1614 * bstrategy(). Set them back to NULL now we're done with it, or we'll 1615 * trigger a KASSERT in relpbuf(). 1616 */ 1617 if (bp->b_vp) { 1618 bp->b_vp = NULL; 1619 bp->b_bufobj = NULL; 1620 } 1621 /* 1622 * release the physical I/O buffer 1623 */ 1624 relpbuf( 1625 bp, 1626 ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : 1627 ((bp->b_flags & B_ASYNC) ? 1628 &nsw_wcount_async : 1629 &nsw_wcount_sync 1630 ) 1631 ) 1632 ); 1633 } 1634 1635 /* 1636 * swap_pager_isswapped: 1637 * 1638 * Return 1 if at least one page in the given object is paged 1639 * out to the given swap device. 
1640 * 1641 * This routine may not block. 1642 */ 1643 int 1644 swap_pager_isswapped(vm_object_t object, struct swdevt *sp) 1645 { 1646 daddr_t index = 0; 1647 int bcount; 1648 int i; 1649 1650 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1651 if (object->type != OBJT_SWAP) 1652 return (0); 1653 1654 mtx_lock(&swhash_mtx); 1655 for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { 1656 struct swblock *swap; 1657 1658 if ((swap = *swp_pager_hash(object, index)) != NULL) { 1659 for (i = 0; i < SWAP_META_PAGES; ++i) { 1660 if (swp_pager_isondev(swap->swb_pages[i], sp)) { 1661 mtx_unlock(&swhash_mtx); 1662 return (1); 1663 } 1664 } 1665 } 1666 index += SWAP_META_PAGES; 1667 if (index > 0x20000000) 1668 panic("swap_pager_isswapped: failed to locate all swap meta blocks"); 1669 } 1670 mtx_unlock(&swhash_mtx); 1671 return (0); 1672 } 1673 1674 /* 1675 * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in 1676 * 1677 * This routine dissociates the page at the given index within a 1678 * swap block from its backing store, paging it in if necessary. 1679 * If the page is paged in, it is placed in the inactive queue, 1680 * since it had its backing store ripped out from under it. 1681 * We also attempt to swap in all other pages in the swap block, 1682 * we only guarantee that the one at the specified index is 1683 * paged in. 1684 * 1685 * XXX - The code to page the whole block in doesn't work, so we 1686 * revert to the one-by-one behavior for now. Sigh. 1687 */ 1688 static inline void 1689 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) 1690 { 1691 vm_page_t m; 1692 1693 vm_object_pip_add(object, 1); 1694 m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY); 1695 if (m->valid == VM_PAGE_BITS_ALL) { 1696 vm_object_pip_subtract(object, 1); 1697 vm_page_lock_queues(); 1698 vm_page_activate(m); 1699 vm_page_dirty(m); 1700 vm_page_unlock_queues(); 1701 vm_page_wakeup(m); 1702 vm_pager_page_unswapped(m); 1703 return; 1704 } 1705 1706 if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK) 1707 panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ 1708 vm_object_pip_subtract(object, 1); 1709 vm_page_lock_queues(); 1710 vm_page_dirty(m); 1711 vm_page_dontneed(m); 1712 vm_page_unlock_queues(); 1713 vm_page_wakeup(m); 1714 vm_pager_page_unswapped(m); 1715 } 1716 1717 /* 1718 * swap_pager_swapoff: 1719 * 1720 * Page in all of the pages that have been paged out to the 1721 * given device. The corresponding blocks in the bitmap must be 1722 * marked as allocated and the device must be flagged SW_CLOSING. 1723 * There may be no processes swapped out to the device. 1724 * 1725 * This routine may block. 
1726 */ 1727 static void 1728 swap_pager_swapoff(struct swdevt *sp) 1729 { 1730 struct swblock *swap; 1731 int i, j, retries; 1732 1733 GIANT_REQUIRED; 1734 1735 retries = 0; 1736 full_rescan: 1737 mtx_lock(&swhash_mtx); 1738 for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ 1739 restart: 1740 for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { 1741 vm_object_t object = swap->swb_object; 1742 vm_pindex_t pindex = swap->swb_index; 1743 for (j = 0; j < SWAP_META_PAGES; ++j) { 1744 if (swp_pager_isondev(swap->swb_pages[j], sp)) { 1745 /* avoid deadlock */ 1746 if (!VM_OBJECT_TRYLOCK(object)) { 1747 break; 1748 } else { 1749 mtx_unlock(&swhash_mtx); 1750 swp_pager_force_pagein(object, 1751 pindex + j); 1752 VM_OBJECT_UNLOCK(object); 1753 mtx_lock(&swhash_mtx); 1754 goto restart; 1755 } 1756 } 1757 } 1758 } 1759 } 1760 mtx_unlock(&swhash_mtx); 1761 if (sp->sw_used) { 1762 /* 1763 * Objects may be locked or paging to the device being 1764 * removed, so we will miss their pages and need to 1765 * make another pass. We have marked this device as 1766 * SW_CLOSING, so the activity should finish soon. 1767 */ 1768 retries++; 1769 if (retries > 100) { 1770 panic("swapoff: failed to locate %d swap blocks", 1771 sp->sw_used); 1772 } 1773 pause("swpoff", hz / 20); 1774 goto full_rescan; 1775 } 1776 } 1777 1778 /************************************************************************ 1779 * SWAP META DATA * 1780 ************************************************************************ 1781 * 1782 * These routines manipulate the swap metadata stored in the 1783 * OBJT_SWAP object. All swp_*() routines must be called at 1784 * splvm() because swap can be freed up by the low level vm_page 1785 * code which might be called from interrupts beyond what splbio() covers. 1786 * 1787 * Swap metadata is implemented with a global hash and not directly 1788 * linked into the object. Instead the object simply contains 1789 * appropriate tracking counters. 1790 */ 1791 1792 /* 1793 * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object 1794 * 1795 * We first convert the object to a swap object if it is a default 1796 * object. 1797 * 1798 * The specified swapblk is added to the object's swap metadata. If 1799 * the swapblk is not valid, it is freed instead. Any previously 1800 * assigned swapblk is freed. 1801 * 1802 * This routine must be called at splvm(), except when used to convert 1803 * an OBJT_DEFAULT object into an OBJT_SWAP object. 1804 */ 1805 static void 1806 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) 1807 { 1808 struct swblock *swap; 1809 struct swblock **pswap; 1810 int idx; 1811 1812 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1813 /* 1814 * Convert default object to swap object if necessary 1815 */ 1816 if (object->type != OBJT_SWAP) { 1817 object->type = OBJT_SWAP; 1818 object->un_pager.swp.swp_bcount = 0; 1819 1820 if (object->handle != NULL) { 1821 mtx_lock(&sw_alloc_mtx); 1822 TAILQ_INSERT_TAIL( 1823 NOBJLIST(object->handle), 1824 object, 1825 pager_object_list 1826 ); 1827 mtx_unlock(&sw_alloc_mtx); 1828 } 1829 } 1830 1831 /* 1832 * Locate hash entry. If not found create, but if we aren't adding 1833 * anything just return. If we run out of space in the map we wait 1834 * and, since the hash table may have changed, retry. 
1835 */ 1836 retry: 1837 mtx_lock(&swhash_mtx); 1838 pswap = swp_pager_hash(object, pindex); 1839 1840 if ((swap = *pswap) == NULL) { 1841 int i; 1842 1843 if (swapblk == SWAPBLK_NONE) 1844 goto done; 1845 1846 swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT); 1847 if (swap == NULL) { 1848 mtx_unlock(&swhash_mtx); 1849 VM_OBJECT_UNLOCK(object); 1850 if (uma_zone_exhausted(swap_zone)) { 1851 printf("swap zone exhausted, increase kern.maxswzone\n"); 1852 vm_pageout_oom(VM_OOM_SWAPZ); 1853 pause("swzonex", 10); 1854 } else 1855 VM_WAIT; 1856 VM_OBJECT_LOCK(object); 1857 goto retry; 1858 } 1859 1860 swap->swb_hnext = NULL; 1861 swap->swb_object = object; 1862 swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; 1863 swap->swb_count = 0; 1864 1865 ++object->un_pager.swp.swp_bcount; 1866 1867 for (i = 0; i < SWAP_META_PAGES; ++i) 1868 swap->swb_pages[i] = SWAPBLK_NONE; 1869 } 1870 1871 /* 1872 * Delete prior contents of metadata 1873 */ 1874 idx = pindex & SWAP_META_MASK; 1875 1876 if (swap->swb_pages[idx] != SWAPBLK_NONE) { 1877 swp_pager_freeswapspace(swap->swb_pages[idx], 1); 1878 --swap->swb_count; 1879 } 1880 1881 /* 1882 * Enter block into metadata 1883 */ 1884 swap->swb_pages[idx] = swapblk; 1885 if (swapblk != SWAPBLK_NONE) 1886 ++swap->swb_count; 1887 done: 1888 mtx_unlock(&swhash_mtx); 1889 } 1890 1891 /* 1892 * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata 1893 * 1894 * The requested range of blocks is freed, with any associated swap 1895 * returned to the swap bitmap. 1896 * 1897 * This routine will free swap metadata structures as they are cleaned 1898 * out. This routine does *NOT* operate on swap metadata associated 1899 * with resident pages. 1900 * 1901 * This routine must be called at splvm() 1902 */ 1903 static void 1904 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count) 1905 { 1906 1907 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 1908 if (object->type != OBJT_SWAP) 1909 return; 1910 1911 while (count > 0) { 1912 struct swblock **pswap; 1913 struct swblock *swap; 1914 1915 mtx_lock(&swhash_mtx); 1916 pswap = swp_pager_hash(object, index); 1917 1918 if ((swap = *pswap) != NULL) { 1919 daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; 1920 1921 if (v != SWAPBLK_NONE) { 1922 swp_pager_freeswapspace(v, 1); 1923 swap->swb_pages[index & SWAP_META_MASK] = 1924 SWAPBLK_NONE; 1925 if (--swap->swb_count == 0) { 1926 *pswap = swap->swb_hnext; 1927 uma_zfree(swap_zone, swap); 1928 --object->un_pager.swp.swp_bcount; 1929 } 1930 } 1931 --count; 1932 ++index; 1933 } else { 1934 int n = SWAP_META_PAGES - (index & SWAP_META_MASK); 1935 count -= n; 1936 index += n; 1937 } 1938 mtx_unlock(&swhash_mtx); 1939 } 1940 } 1941 1942 /* 1943 * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object 1944 * 1945 * This routine locates and destroys all swap metadata associated with 1946 * an object. 

/*
 * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
 *
 *	This routine locates and destroys all swap metadata associated with
 *	an object.
 *
 *	This routine must be called at splvm()
 */
static void
swp_pager_meta_free_all(vm_object_t object)
{
	daddr_t index = 0;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if (object->type != OBJT_SWAP)
		return;

	while (object->un_pager.swp.swp_bcount) {
		struct swblock **pswap;
		struct swblock *swap;

		mtx_lock(&swhash_mtx);
		pswap = swp_pager_hash(object, index);
		if ((swap = *pswap) != NULL) {
			int i;

			for (i = 0; i < SWAP_META_PAGES; ++i) {
				daddr_t v = swap->swb_pages[i];
				if (v != SWAPBLK_NONE) {
					--swap->swb_count;
					swp_pager_freeswapspace(v, 1);
				}
			}
			if (swap->swb_count != 0)
				panic("swap_pager_meta_free_all: swb_count != 0");
			*pswap = swap->swb_hnext;
			uma_zfree(swap_zone, swap);
			--object->un_pager.swp.swp_bcount;
		}
		mtx_unlock(&swhash_mtx);
		index += SWAP_META_PAGES;
		if (index > 0x20000000)
			panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
	}
}

/*
 * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data.
 *
 *	This routine is capable of looking up, popping, or freeing
 *	swapblk assignments in the swap meta data or in the vm_page_t.
 *	The routine typically returns the swapblk being looked-up, or popped,
 *	or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
 *	was invalid.  This routine will automatically free any invalid
 *	meta-data swapblks.
 *
 *	It is not possible to store invalid swapblks in the swap meta data
 *	(other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
 *
 *	When acting on a busy resident page and paging is in progress, we
 *	have to wait until paging is complete but otherwise can act on the
 *	busy page.
 *
 *	This routine must be called at splvm().
 *
 *	SWM_FREE	remove and free swap block from metadata
 *	SWM_POP		remove from meta data but do not free it; pop it out
 */
static daddr_t
swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
{
	struct swblock **pswap;
	struct swblock *swap;
	daddr_t r1;
	int idx;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	/*
	 * The meta data only exists if the object is OBJT_SWAP
	 * and even then might not be allocated yet.
	 */
	if (object->type != OBJT_SWAP)
		return (SWAPBLK_NONE);

	r1 = SWAPBLK_NONE;
	mtx_lock(&swhash_mtx);
	pswap = swp_pager_hash(object, pindex);

	if ((swap = *pswap) != NULL) {
		idx = pindex & SWAP_META_MASK;
		r1 = swap->swb_pages[idx];

		if (r1 != SWAPBLK_NONE) {
			if (flags & SWM_FREE) {
				swp_pager_freeswapspace(r1, 1);
				r1 = SWAPBLK_NONE;
			}
			if (flags & (SWM_FREE|SWM_POP)) {
				swap->swb_pages[idx] = SWAPBLK_NONE;
				if (--swap->swb_count == 0) {
					*pswap = swap->swb_hnext;
					uma_zfree(swap_zone, swap);
					--object->un_pager.swp.swp_bcount;
				}
			}
		}
	}
	mtx_unlock(&swhash_mtx);
	return (r1);
}
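
/*
 * Illustrative summary (not part of the original code) of the three ways
 * swp_pager_meta_ctl() above behaves, derived from its flag handling:
 *
 *	blk = swp_pager_meta_ctl(object, pindex, 0);
 *		pure lookup: returns the assigned block or SWAPBLK_NONE,
 *		metadata is left untouched.
 *
 *	blk = swp_pager_meta_ctl(object, pindex, SWM_POP);
 *		returns the assigned block and removes it from the
 *		metadata, but the swap space itself stays allocated.
 *
 *	(void) swp_pager_meta_ctl(object, pindex, SWM_FREE);
 *		removes the assignment and returns the swap space to the
 *		bitmap; the return value is SWAPBLK_NONE.
 *
 * In every case, clearing the last slot of an swblock also frees the
 * swblock and drops the object's swp_bcount.
 */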

/*
 * System call swapon(name) enables swapping on device name,
 * which must be in the swdevsw.  Return EBUSY
 * if already swapping on this device.
 */
#ifndef _SYS_SYSPROTO_H_
struct swapon_args {
	char *name;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
swapon(struct thread *td, struct swapon_args *uap)
{
	struct vattr attr;
	struct vnode *vp;
	struct nameidata nd;
	int error;

	error = priv_check(td, PRIV_SWAPON);
	if (error)
		return (error);

	mtx_lock(&Giant);
	while (swdev_syscall_active)
		tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
	swdev_syscall_active = 1;

	/*
	 * Swap metadata may not fit in the KVM if we have physical
	 * memory of >1GB.
	 */
	if (swap_zone == NULL) {
		error = ENOMEM;
		goto done;
	}

	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
	    uap->name, td);
	error = namei(&nd);
	if (error)
		goto done;

	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;

	if (vn_isdisk(vp, &error)) {
		error = swapongeom(td, vp);
	} else if (vp->v_type == VREG &&
	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
		/*
		 * Allow direct swapping to NFS regular files in the same
		 * way that nfs_mountroot() sets up diskless swapping.
		 */
		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
	}

	if (error)
		vrele(vp);
done:
	swdev_syscall_active = 0;
	wakeup_one(&swdev_syscall_active);
	mtx_unlock(&Giant);
	return (error);
}

static void
swaponsomething(struct vnode *vp, void *id, u_long nblks,
    sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
{
	struct swdevt *sp, *tsp;
	swblk_t dvbase;
	u_long mblocks;

	/*
	 * If we go beyond this, we get overflows in the radix
	 * tree bitmap code.
	 */
	mblocks = 0x40000000 / BLIST_META_RADIX;
	if (nblks > mblocks) {
		printf(
		    "WARNING: reducing size to maximum of %lu blocks per swap unit\n",
		    mblocks);
		nblks = mblocks;
	}
	/*
	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
	 * First chop nblks off to page-align it, then convert.
	 *
	 * sw->sw_nblks is in page-sized chunks now too.
	 */
	nblks &= ~(ctodb(1) - 1);
	nblks = dbtoc(nblks);

	sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
	sp->sw_vp = vp;
	sp->sw_id = id;
	sp->sw_dev = dev;
	sp->sw_flags = 0;
	sp->sw_nblks = nblks;
	sp->sw_used = 0;
	sp->sw_strategy = strategy;
	sp->sw_close = close;

	sp->sw_blist = blist_create(nblks, M_WAITOK);
	/*
	 * Do not free the first two blocks in order to avoid overwriting
	 * any BSD label at the front of the partition.
	 */
	blist_free(sp->sw_blist, 2, nblks - 2);

	dvbase = 0;
	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH(tsp, &swtailq, sw_list) {
		if (tsp->sw_end >= dvbase) {
			/*
			 * We put one uncovered page between the devices
			 * in order to definitively prevent any cross-device
			 * I/O requests.
			 */
			dvbase = tsp->sw_end + 1;
		}
	}
	sp->sw_first = dvbase;
	sp->sw_end = dvbase + nblks;
	TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
	nswapdev++;
	swap_pager_avail += nblks;
	swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
	swp_sizecheck();
	mtx_unlock(&sw_dev_mtx);
}
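
/*
 * Illustrative arithmetic (not part of the original code) for the unit
 * conversion in swaponsomething() above, assuming PAGE_SIZE is 4096 and
 * DEV_BSIZE is 512, so that one page is 8 disk blocks and ctodb(1) == 8:
 *
 *	nblks = 2097157			(a ~1 GB partition, in 512-byte blocks)
 *	nblks &= ~(ctodb(1) - 1)	-> 2097152	(page aligned)
 *	nblks = dbtoc(nblks)		-> 262144	(now in pages)
 *
 * From here on every swap block number is a page index; sw_first/sw_end
 * place the device inside the single global page-numbered swap space, with
 * one unused page left between devices.
 */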

/*
 * SYSCALL: swapoff(devname)
 *
 * Disable swapping on the given device.
 *
 * XXX: Badly designed system call: it should use a device index
 * rather than a filename as specification.  We keep sw_vp around
 * only to make this work.
 */
#ifndef _SYS_SYSPROTO_H_
struct swapoff_args {
	char *name;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
swapoff(struct thread *td, struct swapoff_args *uap)
{
	struct vnode *vp;
	struct nameidata nd;
	struct swdevt *sp;
	int error;

	error = priv_check(td, PRIV_SWAPOFF);
	if (error)
		return (error);

	mtx_lock(&Giant);
	while (swdev_syscall_active)
		tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
	swdev_syscall_active = 1;

	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
	    td);
	error = namei(&nd);
	if (error)
		goto done;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;

	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH(sp, &swtailq, sw_list) {
		if (sp->sw_vp == vp)
			break;
	}
	mtx_unlock(&sw_dev_mtx);
	if (sp == NULL) {
		error = EINVAL;
		goto done;
	}
	error = swapoff_one(sp, td->td_ucred);
done:
	swdev_syscall_active = 0;
	wakeup_one(&swdev_syscall_active);
	mtx_unlock(&Giant);
	return (error);
}

static int
swapoff_one(struct swdevt *sp, struct ucred *cred)
{
	u_long nblks, dvbase;
#ifdef MAC
	int error;
#endif

	mtx_assert(&Giant, MA_OWNED);
#ifdef MAC
	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
	error = mac_system_check_swapoff(cred, sp->sw_vp);
	(void) VOP_UNLOCK(sp->sw_vp, 0);
	if (error != 0)
		return (error);
#endif
	nblks = sp->sw_nblks;

	/*
	 * We can turn off this swap device safely only if the
	 * available virtual memory in the system will fit the amount
	 * of data we will have to page back in, plus an epsilon so
	 * the system doesn't become critically low on swap space.
	 */
	if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
	    nblks + nswap_lowat) {
		return (ENOMEM);
	}

	/*
	 * Prevent further allocations on this device.
	 */
	mtx_lock(&sw_dev_mtx);
	sp->sw_flags |= SW_CLOSING;
	for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
		swap_pager_avail -= blist_fill(sp->sw_blist,
		    dvbase, dmmax);
	}
	swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE;
	mtx_unlock(&sw_dev_mtx);

	/*
	 * Page in the contents of the device and close it.
	 */
	swap_pager_swapoff(sp);

	sp->sw_close(curthread, sp);
	sp->sw_id = NULL;
	mtx_lock(&sw_dev_mtx);
	TAILQ_REMOVE(&swtailq, sp, sw_list);
	nswapdev--;
	if (nswapdev == 0) {
		swap_pager_full = 2;
		swap_pager_almost_full = 1;
	}
	if (swdevhd == sp)
		swdevhd = NULL;
	mtx_unlock(&sw_dev_mtx);
	blist_destroy(sp->sw_blist);
	free(sp, M_VMPGDATA);
	return (0);
}
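
/*
 * Illustrative arithmetic (not part of the original code) for the ENOMEM
 * check at the top of swapoff_one() above, with made-up counts:
 *
 *	cnt.v_free_count	=  90000 pages
 *	cnt.v_cache_count	=  20000 pages
 *	swap_pager_avail	=  50000 pages	(on the remaining devices)
 *	nblks			= 131072 pages	(device being removed)
 *
 * 90000 + 20000 + 50000 = 160000, which exceeds 131072 plus the
 * nswap_lowat slack (assumed small here), so the removal may proceed.
 * Had the sum fallen short, swapoff(2) would fail rather than let the
 * page-in of the device's contents exhaust memory.
 */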

void
swapoff_all(void)
{
	struct swdevt *sp, *spt;
	const char *devname;
	int error;

	mtx_lock(&Giant);
	while (swdev_syscall_active)
		tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
	swdev_syscall_active = 1;

	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
		mtx_unlock(&sw_dev_mtx);
		if (vn_isdisk(sp->sw_vp, NULL))
			devname = sp->sw_vp->v_rdev->si_name;
		else
			devname = "[file]";
		error = swapoff_one(sp, thread0.td_ucred);
		if (error != 0) {
			printf("Cannot remove swap device %s (error=%d), "
			    "skipping.\n", devname, error);
		} else if (bootverbose) {
			printf("Swap device %s removed.\n", devname);
		}
		mtx_lock(&sw_dev_mtx);
	}
	mtx_unlock(&sw_dev_mtx);

	swdev_syscall_active = 0;
	wakeup_one(&swdev_syscall_active);
	mtx_unlock(&Giant);
}

void
swap_pager_status(int *total, int *used)
{
	struct swdevt *sp;

	*total = 0;
	*used = 0;
	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH(sp, &swtailq, sw_list) {
		*total += sp->sw_nblks;
		*used += sp->sw_used;
	}
	mtx_unlock(&sw_dev_mtx);
}

static int
sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1;
	int error, n;
	struct xswdev xs;
	struct swdevt *sp;

	if (arg2 != 1)			/* name length */
		return (EINVAL);

	n = 0;
	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH(sp, &swtailq, sw_list) {
		if (n == *name) {
			mtx_unlock(&sw_dev_mtx);
			xs.xsw_version = XSWDEV_VERSION;
			xs.xsw_dev = sp->sw_dev;
			xs.xsw_flags = sp->sw_flags;
			xs.xsw_nblks = sp->sw_nblks;
			xs.xsw_used = sp->sw_used;

			error = SYSCTL_OUT(req, &xs, sizeof(xs));
			return (error);
		}
		n++;
	}
	mtx_unlock(&sw_dev_mtx);
	return (ENOENT);
}

SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
    "Number of swap devices");
SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
    "Swap statistics by device");
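
/*
 * Illustrative userland sketch (not part of this file, error handling
 * omitted): how the sysctls exported above are typically consumed.
 * vm.swap_info is a node whose last name component is the device index
 * handed to sysctl_vm_swap_info() as (*name, arg2 == 1), so a caller
 * appends the index to the MIB; struct xswdev and XSWDEV_VERSION come
 * from the kernel VM headers.
 *
 *	int mib[16], n, i;
 *	size_t mibsz, sz;
 *	struct xswdev xsw;
 *
 *	sz = sizeof(n);
 *	sysctlbyname("vm.nswapdev", &n, &sz, NULL, 0);
 *	mibsz = 16;
 *	sysctlnametomib("vm.swap_info", mib, &mibsz);
 *	for (i = 0; i < n; i++) {
 *		mib[mibsz] = i;
 *		sz = sizeof(xsw);
 *		if (sysctl(mib, mibsz + 1, &xsw, &sz, NULL, 0) == 0)
 *			printf("device %d: %d blocks, %d used\n",
 *			    i, xsw.xsw_nblks, xsw.xsw_used);
 *	}
 *
 * Iterating until sysctl() fails with ENOENT works as well, since the
 * handler returns ENOENT once the index runs past the last device.
 */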

/*
 * vmspace_swap_count() - count the approximate swap usage in pages for a
 *			  vmspace.
 *
 *	The map must be locked.
 *
 *	Swap usage is determined by taking the proportional swap used by
 *	VM objects backing the VM map.  To make up for fractional losses,
 *	if the VM object has any swap use at all the associated map entries
 *	count for at least 1 swap page.
 */
int
vmspace_swap_count(struct vmspace *vmspace)
{
	vm_map_t map = &vmspace->vm_map;
	vm_map_entry_t cur;
	int count = 0;

	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
		vm_object_t object;

		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
		    (object = cur->object.vm_object) != NULL) {
			VM_OBJECT_LOCK(object);
			if (object->type == OBJT_SWAP &&
			    object->un_pager.swp.swp_bcount != 0) {
				int n = (cur->end - cur->start) / PAGE_SIZE;

				count += object->un_pager.swp.swp_bcount *
				    SWAP_META_PAGES * n / object->size + 1;
			}
			VM_OBJECT_UNLOCK(object);
		}
	}
	return (count);
}
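
/*
 * Illustrative arithmetic (not part of the original code) for the estimate
 * above, again assuming SWAP_META_PAGES == 16.  Suppose a map entry covers
 * n = 256 pages of an object whose size is 1024 pages and which has
 * swp_bcount == 4 swblocks allocated (at most 64 pages of swap):
 *
 *	count += 4 * 16 * 256 / 1024 + 1 = 16 + 1 = 17
 *
 * i.e. the entry is charged its proportional share of the object's possible
 * swap use, rounded up by one page so that any swap use at all is counted.
 */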

/*
 * GEOM backend
 *
 * Swapping onto disk devices.
 *
 */

static g_orphan_t swapgeom_orphan;

static struct g_class g_swap_class = {
	.name = "SWAP",
	.version = G_VERSION,
	.orphan = swapgeom_orphan,
};

DECLARE_GEOM_CLASS(g_swap_class, g_class);

static void
swapgeom_done(struct bio *bp2)
{
	struct buf *bp;

	bp = bp2->bio_caller2;
	bp->b_ioflags = bp2->bio_flags;
	if (bp2->bio_error)
		bp->b_ioflags |= BIO_ERROR;
	bp->b_resid = bp->b_bcount - bp2->bio_completed;
	bp->b_error = bp2->bio_error;
	bufdone(bp);
	g_destroy_bio(bp2);
}

static void
swapgeom_strategy(struct buf *bp, struct swdevt *sp)
{
	struct bio *bio;
	struct g_consumer *cp;

	cp = sp->sw_id;
	if (cp == NULL) {
		bp->b_error = ENXIO;
		bp->b_ioflags |= BIO_ERROR;
		bufdone(bp);
		return;
	}
	if (bp->b_iocmd == BIO_WRITE)
		bio = g_new_bio();
	else
		bio = g_alloc_bio();
	if (bio == NULL) {
		bp->b_error = ENOMEM;
		bp->b_ioflags |= BIO_ERROR;
		bufdone(bp);
		return;
	}

	bio->bio_caller2 = bp;
	bio->bio_cmd = bp->b_iocmd;
	bio->bio_data = bp->b_data;
	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
	bio->bio_length = bp->b_bcount;
	bio->bio_done = swapgeom_done;
	g_io_request(bio, cp);
	return;
}

static void
swapgeom_orphan(struct g_consumer *cp)
{
	struct swdevt *sp;

	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH(sp, &swtailq, sw_list)
		if (sp->sw_id == cp)
			sp->sw_id = NULL;
	mtx_unlock(&sw_dev_mtx);
}

static void
swapgeom_close_ev(void *arg, int flags)
{
	struct g_consumer *cp;

	cp = arg;
	g_access(cp, -1, -1, 0);
	g_detach(cp);
	g_destroy_consumer(cp);
}

static void
swapgeom_close(struct thread *td, struct swdevt *sw)
{

	/* XXX: direct call when Giant untangled */
	g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
}

struct swh0h0 {
	struct cdev *dev;
	struct vnode *vp;
	int error;
};

static void
swapongeom_ev(void *arg, int flags)
{
	struct swh0h0 *swh;
	struct g_provider *pp;
	struct g_consumer *cp;
	static struct g_geom *gp;
	struct swdevt *sp;
	u_long nblks;
	int error;

	swh = arg;
	swh->error = 0;
	pp = g_dev_getprovider(swh->dev);
	if (pp == NULL) {
		swh->error = ENODEV;
		return;
	}
	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH(sp, &swtailq, sw_list) {
		cp = sp->sw_id;
		if (cp != NULL && cp->provider == pp) {
			mtx_unlock(&sw_dev_mtx);
			swh->error = EBUSY;
			return;
		}
	}
	mtx_unlock(&sw_dev_mtx);
	if (gp == NULL)
		gp = g_new_geomf(&g_swap_class, "swap", NULL);
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	/*
	 * XXX: Every time you think you can improve the margin for
	 * footshooting, somebody depends on the ability to do so:
	 * savecore(8) wants to write to our swapdev so we cannot
	 * set an exclusive count :-(
	 */
	error = g_access(cp, 1, 1, 0);
	if (error) {
		g_detach(cp);
		g_destroy_consumer(cp);
		swh->error = error;
		return;
	}
	nblks = pp->mediasize / DEV_BSIZE;
	swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
	    swapgeom_close, dev2udev(swh->dev));
	swh->error = 0;
	return;
}

static int
swapongeom(struct thread *td, struct vnode *vp)
{
	int error;
	struct swh0h0 swh;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	swh.dev = vp->v_rdev;
	swh.vp = vp;
	swh.error = 0;
	/* XXX: direct call when Giant untangled */
	error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
	if (!error)
		error = swh.error;
	VOP_UNLOCK(vp, 0);
	return (error);
}

/*
 * VNODE backend
 *
 * This is used mainly for network filesystem (read: probably only tested
 * with NFS) swapfiles.
 *
 */

static void
swapdev_strategy(struct buf *bp, struct swdevt *sp)
{
	struct vnode *vp2;

	bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);

	vp2 = sp->sw_id;
	vhold(vp2);
	if (bp->b_iocmd == BIO_WRITE) {
		if (bp->b_bufobj)
			bufobj_wdrop(bp->b_bufobj);
		bufobj_wref(&vp2->v_bufobj);
	}
	if (bp->b_bufobj != &vp2->v_bufobj)
		bp->b_bufobj = &vp2->v_bufobj;
	bp->b_vp = vp2;
	bp->b_iooffset = dbtob(bp->b_blkno);
	bstrategy(bp);
	return;
}

static void
swapdev_close(struct thread *td, struct swdevt *sp)
{

	VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
	vrele(sp->sw_vp);
}

static int
swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
{
	struct swdevt *sp;
	int error;

	if (nblks == 0)
		return (ENXIO);
	mtx_lock(&sw_dev_mtx);
	TAILQ_FOREACH(sp, &swtailq, sw_list) {
		if (sp->sw_id == vp) {
			mtx_unlock(&sw_dev_mtx);
			return (EBUSY);
		}
	}
	mtx_unlock(&sw_dev_mtx);

	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
	error = mac_system_check_swapon(td->td_ucred, vp);
	if (error == 0)
#endif
		error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
	(void) VOP_UNLOCK(vp, 0);
	if (error)
		return (error);

	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
	    NODEV);
	return (0);
}
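
/*
 * Illustrative arithmetic (not part of the original code) tying the two
 * backends together.  By the time sw_strategy is called, bp->b_blkno is a
 * page-numbered block in the global swap space laid out by
 * swaponsomething().  Assuming PAGE_SIZE is 4096, DEV_BSIZE is 512 and a
 * device with sw_first == 262160, a request for swap block 262460 becomes:
 *
 *	GEOM backend:	bio_offset = (262460 - 262160) * PAGE_SIZE
 *				   = 300 * 4096 = 1228800 bytes into the provider
 *
 *	VNODE backend:	b_blkno    = ctodb(262460 - 262160) = 2400 (512-byte blocks)
 *			b_iooffset = dbtob(2400)            = 1228800 bytes into the file
 *
 * so both backends address the same byte offset for the same swap block.
 */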