/*
 * Copyright (c) 1998 Matthew Dillon,
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *				New Swap System
 *				Matthew Dillon
 *
 * Radix Bitmap 'blists'.
 *
 *	- The new swapper uses the new radix bitmap code.  This should scale
 *	  to arbitrarily small or arbitrarily large swap spaces and an almost
 *	  arbitrary degree of fragmentation.
 *
 * Features:
 *
 *	- on the fly reallocation of swap during putpages.  The new system
 *	  does not try to keep previously allocated swap blocks for dirty
 *	  pages.
 *
 *	- on the fly deallocation of swap
 *
 *	- No more garbage collection required.  Unnecessarily allocated swap
 *	  blocks only exist for dirty vm_page_t's now and these are already
 *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
 *	  removal of invalidated swap blocks when a page is destroyed
 *	  or renamed.
 *
 * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
 *
 *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
 *
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/blist.h>
#include <sys/lock.h>

#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
#endif

#define SWB_NPAGES	MAX_PAGEOUT_CLUSTER

#include "opt_swap.h"
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#define SWM_FREE	0x02	/* free, period			*/
#define SWM_POP		0x04	/* pop out			*/

/*
 * vm_swap_size is in page-sized chunks now.  It was DEV_BSIZE'd chunks
 * in the old system.
 */

extern int vm_swap_size;	/* number of free swap blocks, in pages */

int swap_pager_full;		/* swap space exhaustion (task killing) */
static int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/
static int nsw_rcount;		/* free read buffers			*/
static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
static int nsw_wcount_async;	/* limit write buffers / asynchronous	*/
static int nsw_wcount_async_max;/* assigned maximum			*/
static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
static int sw_alloc_interlock;	/* swap pager allocation interlock	*/

struct blist *swapblist;
static struct swblock **swhash;
static int swhash_mask;
static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/

extern struct vnode *swapdev_vp;	/* from vm_swap.c */

SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
	CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");

/*
 * "named" and "unnamed" anon region objects.  Try to reduce the overhead
 * of searching a named list by hashing it just a little.
 */

#define NOBJLISTS		8

#define NOBJLIST(handle)	\
	(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])

static struct pagerlst	swap_pager_object_list[NOBJLISTS];
struct pagerlst		swap_pager_un_object_list;
vm_zone_t		swap_zone;

/*
 * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
 * calls hooked from other parts of the VM system and do not appear here.
 * (see vm/swap_pager.h).
 */

static vm_object_t
	swap_pager_alloc __P((void *handle, vm_ooffset_t size,
			      vm_prot_t prot, vm_ooffset_t offset));
static void	swap_pager_dealloc __P((vm_object_t object));
static int	swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static void	swap_pager_init __P((void));
static void	swap_pager_unswapped __P((vm_page_t));
static void	swap_pager_strategy __P((vm_object_t, struct bio *));

struct pagerops swappagerops = {
	swap_pager_init,	/* early system initialization of pager	*/
	swap_pager_alloc,	/* allocate an OBJT_SWAP object		*/
	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
	swap_pager_getpages,	/* pagein				*/
	swap_pager_putpages,	/* pageout				*/
	swap_pager_haspage,	/* get backing store status for page	*/
	swap_pager_unswapped,	/* remove swap related to page		*/
	swap_pager_strategy	/* pager strategy call			*/
};

static struct buf *getchainbuf(struct bio *bp, struct vnode *vp, int flags);
static void flushchainbuf(struct buf *nbp);
static void waitchainbuf(struct bio *bp, int count, int done);

/*
 * dmmax is in page-sized chunks with the new swap system.  It was
 * dev-bsized chunks in the old.
 *
 * swap_*() routines are externally accessible.  swp_*() routines are
 * internal.
 */

int dmmax;
static int dmmax_mask;
int nswap_lowat = 128;		/* in pages, swap_pager_almost_full warn */
int nswap_hiwat = 512;		/* in pages, swap_pager_almost_full warn */

static __inline void	swp_sizecheck __P((void));
static void	swp_pager_sync_iodone __P((struct buf *bp));
static void	swp_pager_async_iodone __P((struct buf *bp));

/*
 * Swap bitmap functions
 */

static __inline void	swp_pager_freeswapspace __P((daddr_t blk, int npages));
static __inline daddr_t	swp_pager_getswapspace __P((int npages));

/*
 * Metadata functions
 */

static void swp_pager_meta_build __P((vm_object_t, vm_pindex_t, daddr_t));
static void swp_pager_meta_free __P((vm_object_t, vm_pindex_t, daddr_t));
static void swp_pager_meta_free_all __P((vm_object_t));
static daddr_t swp_pager_meta_ctl __P((vm_object_t, vm_pindex_t, int));

/*
 * SWP_SIZECHECK() -	update swap_pager_full indication
 *
 *	update the swap_pager_almost_full indication and warn when we are
 *	about to run out of swap space, using lowat/hiwat hysteresis.
 *
 *	Clear swap_pager_full ( task killing ) indication when lowat is met.
 *
 *	No restrictions on call
 *	This routine may not block.
 *	This routine must be called at splvm()
 */

static __inline void
swp_sizecheck()
{
	if (vm_swap_size < nswap_lowat) {
		if (swap_pager_almost_full == 0) {
			printf("swap_pager: out of swap space\n");
			swap_pager_almost_full = 1;
		}
	} else {
		swap_pager_full = 0;
		if (vm_swap_size > nswap_hiwat)
			swap_pager_almost_full = 0;
	}
}

/*
 * SWAP_PAGER_INIT() -	initialize the swap pager!
 *
 *	Expected to be started from system init.  NOTE:  This code is run
 *	before much else so be careful what you depend on.  Most of the VM
 *	system has yet to be initialized at this point.
 */

static void
swap_pager_init()
{
	/*
	 * Initialize object lists
	 */
	int i;

	for (i = 0; i < NOBJLISTS; ++i)
		TAILQ_INIT(&swap_pager_object_list[i]);
	TAILQ_INIT(&swap_pager_un_object_list);

	/*
	 * Device Stripe, in PAGE_SIZE'd blocks
	 */

	dmmax = SWB_NPAGES * 2;
	dmmax_mask = ~(dmmax - 1);
}

/*
 * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
 *
 *	Expected to be started from pageout process once, prior to entering
 *	its main loop.
 */

void
swap_pager_swap_init()
{
	int n;

	/*
	 * Number of in-transit swap bp operations.  Don't
	 * exhaust the pbufs completely.  Make sure we
	 * initialize workable values (0 will work for hysteresis
	 * but it isn't very efficient).
	 *
	 * The nsw_cluster_max is constrained by the bp->b_pages[]
	 * array (MAXPHYS/PAGE_SIZE) and our locally defined
	 * MAX_PAGEOUT_CLUSTER.  Also be aware that swap ops are
	 * constrained by the swap device interleave stripe size.
	 *
	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
	 * designed to prevent other I/O from having high latencies due to
	 * our pageout I/O.  The value 4 works well for one or two active swap
	 * devices but is probably a little low if you have more.  Even so,
	 * a higher value would probably generate only a limited improvement
	 * with three or four active swap devices since the system does not
	 * typically have to pageout at extreme bandwidths.  We will want
	 * at least 2 per swap devices, and 4 is a pretty good value if you
	 * have one NFS swap device due to the command/ack latency over NFS.
	 * So it all works out pretty well.
	 */

	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);

	nsw_rcount = (nswbuf + 1) / 2;
	nsw_wcount_sync = (nswbuf + 3) / 4;
	nsw_wcount_async = 4;
	nsw_wcount_async_max = nsw_wcount_async;

	/*
	 * Initialize our zone.  Right now I'm just guessing on the number
	 * we need based on the number of pages in the system.  Each swblock
	 * can hold 16 pages, so this is probably overkill.
	 */

	n = cnt.v_page_count * 2;

	swap_zone = zinit(
	    "SWAPMETA", 
	    sizeof(struct swblock), 
	    n,
	    ZONE_INTERRUPT, 
	    1
	);

	/*
	 * Initialize our meta-data hash table.  The swapper does not need to
	 * be quite as efficient as the VM system, so we do not use an 
	 * oversized hash table.
	 *
	 * 	n: 		size of hash table, must be power of 2
	 * 	swhash_mask:	hash table index mask
	 */

	for (n = 1; n < cnt.v_page_count / 4; n <<= 1)
		;

	swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK);
	bzero(swhash, sizeof(struct swblock *) * n);

	swhash_mask = n - 1;
}

/*
 * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
 *			its metadata structures.
 *
 *	This routine is called from the mmap and fork code to create a new
 *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
 *	and then converting it with swp_pager_meta_build().
 *
 *	This routine may block in vm_object_allocate() and create a named
 *	object lookup race, so we must interlock.
 *	We must also run at splvm() for the object lookup to handle races
 *	with interrupts, but we do not have to maintain splvm() in between
 *	the lookup and the add because (I believe) it is not possible to
 *	attempt to create a new swap object w/handle when a default object
 *	with that handle already exists.
 */

static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
		 vm_ooffset_t offset)
{
	vm_object_t object;

	if (handle) {
		/*
		 * Reference existing named region or allocate new one.  There
		 * should not be a race here against swp_pager_meta_build()
		 * as called from vm_page_remove() in regards to the lookup
		 * of the handle.
		 */

		while (sw_alloc_interlock) {
			sw_alloc_interlock = -1;
			tsleep(&sw_alloc_interlock, PVM, "swpalc", 0);
		}
		sw_alloc_interlock = 1;

		object = vm_pager_object_lookup(NOBJLIST(handle), handle);

		if (object != NULL) {
			vm_object_reference(object);
		} else {
			object = vm_object_allocate(OBJT_DEFAULT,
				OFF_TO_IDX(offset + PAGE_MASK + size));
			object->handle = handle;

			swp_pager_meta_build(object, 0, SWAPBLK_NONE);
		}

		if (sw_alloc_interlock < 0)
			wakeup(&sw_alloc_interlock);

		sw_alloc_interlock = 0;
	} else {
		object = vm_object_allocate(OBJT_DEFAULT,
			OFF_TO_IDX(offset + PAGE_MASK + size));

		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
	}

	return (object);
}

/*
 * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
 *
 *	The swap backing for the object is destroyed.  The code is 
 *	designed such that we can reinstantiate it later, but this
 *	routine is typically called only when the entire object is
 *	about to be destroyed.
 *
 *	This routine may block, but no longer does.
 *
 *	The object must be locked or unreferenceable.
 */

static void
swap_pager_dealloc(object)
	vm_object_t object;
{
	int s;

	/*
	 * Remove from list right away so lookups will fail if we block for
	 * pageout completion.
	 */

	if (object->handle == NULL) {
		TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list);
	} else {
		TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
	}

	vm_object_pip_wait(object, "swpdea");

	/*
	 * Free all remaining metadata.  We only bother to free it from 
	 * the swap meta data.  We do not attempt to free swapblk's still
	 * associated with vm_page_t's for this object.  We do not care
	 * if paging is still in progress on some objects.
	 */
	s = splvm();
	swp_pager_meta_free_all(object);
	splx(s);
}

/************************************************************************
 *			SWAP PAGER BITMAP ROUTINES			*
 ************************************************************************/

/*
 * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
 *
 *	Allocate swap for the requested number of pages.  The starting
 *	swap block number (a page index) is returned or SWAPBLK_NONE
 *	if the allocation failed.
 *
 *	Also has the side effect of advising that somebody made a mistake
 *	when they configured swap and didn't configure enough.
 *
 *	Must be called at splvm() to avoid races with bitmap frees from
 *	vm_page_remove() aka swap_pager_page_removed().
 *
 *	This routine may not block
 *	This routine must be called at splvm().
 */

static __inline daddr_t
swp_pager_getswapspace(npages)
	int npages;
{
	daddr_t blk;

	if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
		if (swap_pager_full != 2) {
			printf("swap_pager_getswapspace: failed\n");
			swap_pager_full = 2;
			swap_pager_almost_full = 1;
		}
	} else {
		vm_swap_size -= npages;
		swp_sizecheck();
	}
	return(blk);
}

/*
 * SWP_PAGER_FREESWAPSPACE() -	free raw swap space 
 *
 *	This routine returns the specified swap blocks back to the bitmap.
 *
 *	Note:  This routine may not block (it could in the old swap code),
 *	and through the use of the new blist routines it does not block.
 *
 *	We must be called at splvm() to avoid races with bitmap frees from
 *	vm_page_remove() aka swap_pager_page_removed().
 *
 *	This routine may not block
 *	This routine must be called at splvm().
 */

static __inline void
swp_pager_freeswapspace(blk, npages)
	daddr_t blk;
	int npages;
{
	blist_free(swapblist, blk, npages);
	vm_swap_size += npages;
	swp_sizecheck();
}

/*
 * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
 *				range within an object.
 *
 *	This is a globally accessible routine.
 *
 *	This routine removes swapblk assignments from swap metadata.
 *
 *	The external callers of this routine typically have already destroyed 
 *	or renamed vm_page_t's associated with this range in the object so 
 *	we should be ok.
 *
 *	This routine may be called at any spl.  We up our spl to splvm
 *	temporarily in order to perform the metadata removal.
 */

void
swap_pager_freespace(object, start, size)
	vm_object_t object;
	vm_pindex_t start;
	vm_size_t size;
{
	int s = splvm();
	swp_pager_meta_free(object, start, size);
	splx(s);
}

/*
 * SWAP_PAGER_RESERVE() - reserve swap blocks in object
 *
 *	Assigns swap blocks to the specified range within the object.  The 
 *	swap blocks are not zeroed.  Any previous swap assignment is
 *	destroyed.
 *
 *	Returns 0 on success, -1 on failure.
 */

int
swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
	int s;
	int n = 0;
	daddr_t blk = SWAPBLK_NONE;
	vm_pindex_t beg = start;	/* save start index */

	s = splvm();
	while (size) {
		if (n == 0) {
			n = BLIST_MAX_ALLOC;
			while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
				n >>= 1;
				if (n == 0) {
					swp_pager_meta_free(object, beg, start - beg);
					splx(s);
					return(-1);
				}
			}
		}
		swp_pager_meta_build(object, start, blk);
		--size;
		++start;
		++blk;
		--n;
	}
	swp_pager_meta_free(object, start, n);
	splx(s);
	return(0);
}

/*
 * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
 *			and destroy the source.
 *
 *	Copy any valid swapblks from the source to the destination.  In
 *	cases where both the source and destination have a valid swapblk,
 *	we keep the destination's.
 *
 *	This routine is allowed to block.  It may block allocating metadata
 *	indirectly through swp_pager_meta_build() or if paging is still in
 *	progress on the source.
 *
 *	This routine can be called at any spl
 *
 *	XXX vm_page_collapse() kinda expects us not to block because we 
 *	supposedly do not need to allocate memory, but for the moment we
 *	*may* have to get a little memory from the zone allocator, but
 *	it is taken from the interrupt memory.  We should be ok.
 *
 *	The source object contains no vm_page_t's (which is just as well)
 *
 *	The source object is of type OBJT_SWAP.
 *
 *	The source and destination objects must be locked or 
 *	inaccessible (XXX are they ?)
 */

void
swap_pager_copy(srcobject, dstobject, offset, destroysource)
	vm_object_t srcobject;
	vm_object_t dstobject;
	vm_pindex_t offset;
	int destroysource;
{
	vm_pindex_t i;
	int s;

	s = splvm();

	/*
	 * If destroysource is set, we remove the source object from the 
	 * swap_pager internal queue now.
	 */

	if (destroysource) {
		if (srcobject->handle == NULL) {
			TAILQ_REMOVE(
			    &swap_pager_un_object_list, 
			    srcobject, 
			    pager_object_list
			);
		} else {
			TAILQ_REMOVE(
			    NOBJLIST(srcobject->handle),
			    srcobject,
			    pager_object_list
			);
		}
	}

	/*
	 * transfer source to destination.
	 */

	for (i = 0; i < dstobject->size; ++i) {
		daddr_t dstaddr;

		/*
		 * Locate (without changing) the swapblk on the destination,
		 * unless it is invalid in which case free it silently, or
		 * if the destination is a resident page, in which case the
		 * source is thrown away.
		 */

		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);

		if (dstaddr == SWAPBLK_NONE) {
			/*
			 * Destination has no swapblk and is not resident,
			 * copy source.
			 */
			daddr_t srcaddr;

			srcaddr = swp_pager_meta_ctl(
			    srcobject, 
			    i + offset,
			    SWM_POP
			);

			if (srcaddr != SWAPBLK_NONE)
				swp_pager_meta_build(dstobject, i, srcaddr);
		} else {
			/*
			 * Destination has valid swapblk or it is represented
			 * by a resident page.  We destroy the sourceblock.
			 */

			swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
		}
	}

	/*
	 * Free left over swap blocks in source.
	 *
	 * We have to revert the type to OBJT_DEFAULT so we do not
	 * accidentally double-remove the object from the swap queues.
	 */

	if (destroysource) {
		swp_pager_meta_free_all(srcobject);
		/*
		 * Reverting the type is not necessary, the caller is going
		 * to destroy srcobject directly, but I'm doing it here
		 * for consistency since we've removed the object from its
		 * queues.
		 */
		srcobject->type = OBJT_DEFAULT;
	}
	splx(s);
}

/*
 * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
 *				the requested page.
 *
 *	We determine whether good backing store exists for the requested
 *	page and return TRUE if it does, FALSE if it doesn't.
 *
 *	If TRUE, we also try to determine how much valid, contiguous backing
 *	store exists before and after the requested page within a reasonable
 *	distance.  We do not try to restrict it to the swap device stripe
 *	(that is handled in getpages/putpages).  It probably isn't worth
 *	doing here.
 */

boolean_t
swap_pager_haspage(object, pindex, before, after)
	vm_object_t object;
	vm_pindex_t pindex;
	int *before;
	int *after;
{
	daddr_t blk0;
	int s;

	/*
	 * do we have good backing store at the requested index ?
	 */

	s = splvm();
	blk0 = swp_pager_meta_ctl(object, pindex, 0);

	if (blk0 == SWAPBLK_NONE) {
		splx(s);
		if (before)
			*before = 0;
		if (after)
			*after = 0;
		return (FALSE);
	}

	/*
	 * find backwards-looking contiguous good backing store
	 */

	if (before != NULL) {
		int i;

		for (i = 1; i < (SWB_NPAGES/2); ++i) {
			daddr_t blk;

			if (i > pindex)
				break;
			blk = swp_pager_meta_ctl(object, pindex - i, 0);
			if (blk != blk0 - i)
				break;
		}
		*before = (i - 1);
	}

	/*
	 * find forward-looking contiguous good backing store
	 */

	if (after != NULL) {
		int i;

		for (i = 1; i < (SWB_NPAGES/2); ++i) {
			daddr_t blk;

			blk = swp_pager_meta_ctl(object, pindex + i, 0);
			if (blk != blk0 + i)
				break;
		}
		*after = (i - 1);
	}
	splx(s);
	return (TRUE);
}

/*
 * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
 *
 *	This removes any associated swap backing store, whether valid or
 *	not, from the page.
 *
 *	This routine is typically called when a page is made dirty, at
 *	which point any associated swap can be freed.  MADV_FREE also
 *	calls us in a special-case situation
 *
 *	NOTE!!!  If the page is clean and the swap was valid, the caller
 *	should make the page dirty before calling this routine.  This routine
 *	does NOT change the m->dirty status of the page.  Also: MADV_FREE
 *	depends on it.
 *
 *	This routine may not block
 *	This routine must be called at splvm()
 */

static void
swap_pager_unswapped(m)
	vm_page_t m;
{
	swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
}

/*
 * SWAP_PAGER_STRATEGY() - read, write, free blocks
 *
 *	This implements the vm_pager_strategy() interface to swap and allows
 *	other parts of the system to directly access swap as backing store
 *	through vm_objects of type OBJT_SWAP.  This is intended to be a 
 *	cacheless interface ( i.e. caching occurs at higher levels ).
 *	Therefore we do not maintain any resident pages.  All I/O goes
 *	directly to and from the swap device.
 *
 *	Note that b_blkno is scaled for PAGE_SIZE
 *
 *	We currently attempt to run I/O synchronously or asynchronously as
 *	the caller requests.  This isn't perfect because we lose error
 *	sequencing when we run multiple ops in parallel to satisfy a request.
 *	But this is swap, so we let it all hang out.
 */

static void
swap_pager_strategy(vm_object_t object, struct bio *bp)
{
	vm_pindex_t start;
	int count;
	int s;
	char *data;
	struct buf *nbp = NULL;

	/* XXX: KASSERT instead ? */
	if (bp->bio_bcount & PAGE_MASK) {
		bp->bio_error = EINVAL;
		bp->bio_flags |= BIO_ERROR;
		biodone(bp);
		printf("swap_pager_strategy: bp %p blk %d size %d, not page bounded\n", bp, (int)bp->bio_pblkno, (int)bp->bio_bcount);
		return;
	}

	/*
	 * Clear error indication, initialize page index, count, data pointer.
	 */

	bp->bio_error = 0;
	bp->bio_flags &= ~BIO_ERROR;
	bp->bio_resid = bp->bio_bcount;

	start = bp->bio_pblkno;
	count = howmany(bp->bio_bcount, PAGE_SIZE);
	data = bp->bio_data;

	s = splvm();

	/*
	 * Deal with BIO_DELETE
	 */

	if (bp->bio_cmd == BIO_DELETE) {
		/*
		 * FREE PAGE(s) - destroy underlying swap that is no longer
		 *		  needed.
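		 *		  No device I/O is issued; the underlying
		 *		  swap blocks are simply returned to the
		 *		  swap bitmap.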
		 */
		swp_pager_meta_free(object, start, count);
		splx(s);
		bp->bio_resid = 0;
		biodone(bp);
		return;
	}

	/*
	 * Execute read or write
	 */

	while (count > 0) {
		daddr_t blk;

		/*
		 * Obtain block.  If block not found and writing, allocate a
		 * new block and build it into the object.
		 */

		blk = swp_pager_meta_ctl(object, start, 0);
		if ((blk == SWAPBLK_NONE) && (bp->bio_cmd == BIO_WRITE)) {
			blk = swp_pager_getswapspace(1);
			if (blk == SWAPBLK_NONE) {
				bp->bio_error = ENOMEM;
				bp->bio_flags |= BIO_ERROR;
				break;
			}
			swp_pager_meta_build(object, start, blk);
		}

		/*
		 * Do we have to flush our current collection?  Yes if:
		 *
		 *	- no swap block at this index
		 *	- swap block is not contiguous
		 *	- we cross a physical disk boundary in the
		 *	  stripe.
		 */

		if (
		    nbp && (nbp->b_blkno + btoc(nbp->b_bcount) != blk ||
		     ((nbp->b_blkno ^ blk) & dmmax_mask)
		    )
		) {
			splx(s);
			if (bp->bio_cmd == BIO_READ) {
				++cnt.v_swapin;
				cnt.v_swappgsin += btoc(nbp->b_bcount);
			} else {
				++cnt.v_swapout;
				cnt.v_swappgsout += btoc(nbp->b_bcount);
				nbp->b_dirtyend = nbp->b_bcount;
			}
			flushchainbuf(nbp);
			s = splvm();
			nbp = NULL;
		}

		/*
		 * Add new swapblk to nbp, instantiating nbp if necessary.
		 * Zero-fill reads are able to take a shortcut.
		 */

		if (blk == SWAPBLK_NONE) {
			/*
			 * We can only get here if we are reading.  Since
			 * we are at splvm() we can safely modify b_resid,
			 * even if chain ops are in progress.
			 */
			bzero(data, PAGE_SIZE);
			bp->bio_resid -= PAGE_SIZE;
		} else {
			if (nbp == NULL) {
				nbp = getchainbuf(bp, swapdev_vp, B_ASYNC);
				nbp->b_blkno = blk;
				nbp->b_bcount = 0;
				nbp->b_data = data;
			}
			nbp->b_bcount += PAGE_SIZE;
		}
		--count;
		++start;
		data += PAGE_SIZE;
	}

	/*
	 * Flush out last buffer
	 */

	splx(s);

	if (nbp) {
		if (nbp->b_iocmd == BIO_READ) {
			++cnt.v_swapin;
			cnt.v_swappgsin += btoc(nbp->b_bcount);
		} else {
			++cnt.v_swapout;
			cnt.v_swappgsout += btoc(nbp->b_bcount);
			nbp->b_dirtyend = nbp->b_bcount;
		}
		flushchainbuf(nbp);
		/* nbp = NULL; */
	}

	/*
	 * Wait for completion.
	 */

	waitchainbuf(bp, 0, 1);
}

/*
 * SWAP_PAGER_GETPAGES() - bring pages in from swap
 *
 *	Attempt to retrieve (m, count) pages from backing store, but make
 *	sure we retrieve at least m[reqpage].  We try to load in as large
 *	a chunk surrounding m[reqpage] as is contiguous in swap and which
 *	belongs to the same object.
 *
 *	The code is designed for asynchronous operation and 
 *	immediate-notification of 'reqpage' but tends not to be
 *	used that way.  Please do not optimize-out this algorithmic
 *	feature, I intend to improve on it in the future.
 *
 *	The parent has a single vm_object_pip_add() reference prior to
 *	calling us and we should return with the same.
 *
 *	The parent has BUSY'd the pages.  We should return with 'm'
 *	left busy, but the others adjusted.
 */

static int
swap_pager_getpages(object, m, count, reqpage)
	vm_object_t object;
	vm_page_t *m;
	int count, reqpage;
{
	struct buf *bp;
	vm_page_t mreq;
	int s;
	int i;
	int j;
	daddr_t blk;
	vm_offset_t kva;
	vm_pindex_t lastpindex;

	mreq = m[reqpage];

	if (mreq->object != object) {
		panic("swap_pager_getpages: object mismatch %p/%p", 
		    object, 
		    mreq->object
		);
	}
	/*
	 * Calculate range to retrieve.  The pages have already been assigned
	 * their swapblks.  We require a *contiguous* range that falls entirely
	 * within a single device stripe.  If we do not supply it, bad things
	 * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the 
	 * loops are set up such that the case(s) are handled implicitly.
	 *
	 * The swp_*() calls must be made at splvm().  vm_page_free() does
	 * not need to be, but it will go a little faster if it is.
	 */

	s = splvm();
	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);

	for (i = reqpage - 1; i >= 0; --i) {
		daddr_t iblk;

		iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
		if (blk != iblk + (reqpage - i))
			break;
		if ((blk ^ iblk) & dmmax_mask)
			break;
	}
	++i;

	for (j = reqpage + 1; j < count; ++j) {
		daddr_t jblk;

		jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
		if (blk != jblk - (j - reqpage))
			break;
		if ((blk ^ jblk) & dmmax_mask)
			break;
	}

	/*
	 * free pages outside our collection range.  Note: we never free
	 * mreq, it must remain busy throughout.
	 */

	{
		int k;

		for (k = 0; k < i; ++k)
			vm_page_free(m[k]);
		for (k = j; k < count; ++k)
			vm_page_free(m[k]);
	}
	splx(s);


	/*
	 * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq 
	 * still busy, but the others unbusied.
	 */

	if (blk == SWAPBLK_NONE)
		return(VM_PAGER_FAIL);

	/*
	 * Get a swap buffer header to perform the IO
	 */

	bp = getpbuf(&nsw_rcount);
	kva = (vm_offset_t) bp->b_data;

	/*
	 * map our page(s) into kva for input
	 *
	 * NOTE: B_PAGING is set by pbgetvp()
	 */

	pmap_qenter(kva, m + i, j - i);

	bp->b_iocmd = BIO_READ;
	bp->b_iodone = swp_pager_async_iodone;
	bp->b_rcred = bp->b_wcred = proc0.p_ucred;
	bp->b_data = (caddr_t) kva;
	crhold(bp->b_rcred);
	crhold(bp->b_wcred);
	bp->b_blkno = blk - (reqpage - i);
	bp->b_bcount = PAGE_SIZE * (j - i);
	bp->b_bufsize = PAGE_SIZE * (j - i);
	bp->b_pager.pg_reqpage = reqpage - i;

	{
		int k;

		for (k = i; k < j; ++k) {
			bp->b_pages[k - i] = m[k];
			vm_page_flag_set(m[k], PG_SWAPINPROG);
		}
	}
	bp->b_npages = j - i;

	pbgetvp(swapdev_vp, bp);

	cnt.v_swapin++;
	cnt.v_swappgsin += bp->b_npages;

	/*
	 * We still hold the lock on mreq, and our automatic completion routine
	 * does not remove it.
	 */

	vm_object_pip_add(mreq->object, bp->b_npages);
	lastpindex = m[j-1]->pindex;

	/*
	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
	 * this point because we automatically release it on completion.
	 * Instead, we look at the one page we are interested in which we
	 * still hold a lock on even through the I/O completion.
	 *
	 * The other pages in our m[] array are also released on completion,
	 * so we cannot assume they are valid anymore either.
	 *
	 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
	 */

	BUF_KERNPROC(bp);
	BUF_STRATEGY(bp);

	/*
	 * wait for the page we want to complete.  PG_SWAPINPROG is always
	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
	 * is set in the meta-data.
	 */

	s = splvm();

	while ((mreq->flags & PG_SWAPINPROG) != 0) {
		vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
		cnt.v_intrans++;
		if (tsleep(mreq, PSWP, "swread", hz*20)) {
			printf(
			    "swap_pager: indefinite wait buffer: device:"
				" %s, blkno: %ld, size: %ld\n",
			    devtoname(bp->b_dev), (long)bp->b_blkno,
			    bp->b_bcount
			);
		}
	}

	splx(s);

	/*
	 * mreq is left busied after completion, but all the other pages
	 * are freed.  If we had an unrecoverable read error the page will
	 * not be valid.
	 */

	if (mreq->valid != VM_PAGE_BITS_ALL) {
		return(VM_PAGER_ERROR);
	} else {
		return(VM_PAGER_OK);
	}

	/*
	 * A final note: in a low swap situation, we cannot deallocate swap
	 * and mark a page dirty here because the caller is likely to mark
	 * the page clean when we return, causing the page to possibly revert 
	 * to all-zero's later.
	 */
}

/*
 *	swap_pager_putpages: 
 *
 *	Assign swap (if necessary) and initiate I/O on the specified pages.
 *
 *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
 *	are automatically converted to SWAP objects.
 *
 *	In a low memory situation we may block in VOP_STRATEGY(), but the new 
 *	vm_page reservation system coupled with properly written VFS devices 
 *	should ensure that no low-memory deadlock occurs.  This is an area
 *	which needs work.
 *
 *	The parent has N vm_object_pip_add() references prior to
 *	calling us and will remove references for rtvals[] that are
 *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
 *	completion.
 *
 *	The parent has soft-busy'd the pages it passes us and will unbusy
 *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
 *	We need to unbusy the rest on I/O completion.
 */

void
swap_pager_putpages(object, m, count, sync, rtvals)
	vm_object_t object;
	vm_page_t *m;
	int count;
	boolean_t sync;
	int *rtvals;
{
	int i;
	int n = 0;

	if (count && m[0]->object != object) {
		panic("swap_pager_putpages: object mismatch %p/%p", 
		    object, 
		    m[0]->object
		);
	}
	/*
	 * Step 1
	 *
	 * Turn object into OBJT_SWAP
	 * check for bogus sysops
	 * force sync if not pageout process
	 */

	if (object->type != OBJT_SWAP)
		swp_pager_meta_build(object, 0, SWAPBLK_NONE);

	if (curproc != pageproc)
		sync = TRUE;

	/*
	 * Step 2
	 *
	 * Update nsw parameters from swap_async_max sysctl values.  
	 * Do not let the sysop crash the machine with bogus numbers.
	 */

	if (swap_async_max != nsw_wcount_async_max) {
		int n;
		int s;

		/*
		 * limit range
		 */
		if ((n = swap_async_max) > nswbuf / 2)
			n = nswbuf / 2;
		if (n < 1)
			n = 1;
		swap_async_max = n;

		/*
		 * Adjust difference ( if possible ).
		 * If the current async count is too low, we may not be
		 * able to make the adjustment at this time.
		 */
		s = splvm();
		n -= nsw_wcount_async_max;
		if (nsw_wcount_async + n >= 0) {
			nsw_wcount_async += n;
			nsw_wcount_async_max += n;
			wakeup(&nsw_wcount_async);
		}
		splx(s);
	}

	/*
	 * Step 3
	 *
	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
	 * The page is left dirty until the pageout operation completes
	 * successfully.
	 */

	for (i = 0; i < count; i += n) {
		int s;
		int j;
		struct buf *bp;
		daddr_t blk;

		/*
		 * Maximum I/O size is limited by a number of factors.
		 */

		n = min(BLIST_MAX_ALLOC, count - i);
		n = min(n, nsw_cluster_max);

		s = splvm();

		/*
		 * Get biggest block of swap we can.  If we fail, fall
		 * back and try to allocate a smaller block.  Don't go
		 * overboard trying to allocate space if it would overly
		 * fragment swap.
		 */
		while (
		    (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
		    n > 4
		) {
			n >>= 1;
		}
		if (blk == SWAPBLK_NONE) {
			for (j = 0; j < n; ++j)
				rtvals[i+j] = VM_PAGER_FAIL;
			splx(s);
			continue;
		}

		/*
		 * The I/O we are constructing cannot cross a physical
		 * disk boundary in the swap stripe.  Note: we are still
		 * at splvm().
		 */
		if ((blk ^ (blk + n)) & dmmax_mask) {
			j = ((blk + dmmax) & dmmax_mask) - blk;
			swp_pager_freeswapspace(blk + j, n - j);
			n = j;
		}

		/*
		 * All I/O parameters have been satisfied, build the I/O
		 * request and assign the swap space.
		 *
		 * NOTE: B_PAGING is set by pbgetvp()
		 */

		if (sync == TRUE) {
			bp = getpbuf(&nsw_wcount_sync);
		} else {
			bp = getpbuf(&nsw_wcount_async);
			bp->b_flags = B_ASYNC;
		}
		bp->b_iocmd = BIO_WRITE;
		bp->b_spc = NULL;	/* not used, but NULL-out anyway */

		pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);

		bp->b_rcred = bp->b_wcred = proc0.p_ucred;
		bp->b_bcount = PAGE_SIZE * n;
		bp->b_bufsize = PAGE_SIZE * n;
		bp->b_blkno = blk;

		crhold(bp->b_rcred);
		crhold(bp->b_wcred);

		pbgetvp(swapdev_vp, bp);

		for (j = 0; j < n; ++j) {
			vm_page_t mreq = m[i+j];

			swp_pager_meta_build(
			    mreq->object, 
			    mreq->pindex,
			    blk + j
			);
			vm_page_dirty(mreq);
			rtvals[i+j] = VM_PAGER_OK;

			vm_page_flag_set(mreq, PG_SWAPINPROG);
			bp->b_pages[j] = mreq;
		}
		bp->b_npages = n;
		/*
		 * Must set dirty range for NFS to work.
		 */
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bcount;

		cnt.v_swapout++;
		cnt.v_swappgsout += bp->b_npages;
		swapdev_vp->v_numoutput++;

		splx(s);

		/*
		 * asynchronous
		 *
		 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
		 */

		if (sync == FALSE) {
			bp->b_iodone = swp_pager_async_iodone;
			BUF_KERNPROC(bp);
			BUF_STRATEGY(bp);

			for (j = 0; j < n; ++j)
				rtvals[i+j] = VM_PAGER_PEND;
			continue;
		}

		/*
		 * synchronous
		 *
		 * NOTE: b_blkno is destroyed by the call to VOP_STRATEGY
		 */

		bp->b_iodone = swp_pager_sync_iodone;
		BUF_STRATEGY(bp);

		/*
		 * Wait for the sync I/O to complete, then update rtvals.
		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
		 * our async completion routine at the end, thus avoiding a
		 * double-free.
		 */
		s = splbio();

		while ((bp->b_flags & B_DONE) == 0) {
			tsleep(bp, PVM, "swwrt", 0);
		}

		for (j = 0; j < n; ++j)
			rtvals[i+j] = VM_PAGER_PEND;

		/*
		 * Now that we are through with the bp, we can call the
		 * normal async completion, which frees everything up.
		 */

		swp_pager_async_iodone(bp);

		splx(s);
	}
}

/*
 *	swp_pager_sync_iodone:
 *
 *	Completion routine for synchronous reads and writes from/to swap.
 *	We just mark the bp as complete and wake up anyone waiting on it.
 *
 *	This routine may not block.  This routine is called at splbio()
 *	or better.
 */

static void
swp_pager_sync_iodone(bp)
	struct buf *bp;
{
	bp->b_flags |= B_DONE;
	bp->b_flags &= ~B_ASYNC;
	wakeup(bp);
}

/*
 *	swp_pager_async_iodone:
 *
 *	Completion routine for asynchronous reads and writes from/to swap.
 *	Also called manually by synchronous code to finish up a bp.
 *
 *	For READ operations, the pages are PG_BUSY'd.  For WRITE operations, 
 *	the pages are vm_page_t->busy'd.  For READ operations, we PG_BUSY 
 *	unbusy all pages except the 'main' request page.  For WRITE 
 *	operations, we vm_page_t->busy'd unbusy all pages ( we can do this 
 *	because we marked them all VM_PAGER_PEND on return from putpages ).
 *
 *	This routine may not block.
 *	This routine is called at splbio() or better
 *
 *	We up ourselves to splvm() as required for various vm_page related
 *	calls.
 */

static void
swp_pager_async_iodone(bp)
	register struct buf *bp;
{
	int s;
	int i;
	vm_object_t object = NULL;

	bp->b_flags |= B_DONE;

	/*
	 * report error
	 */

	if (bp->b_ioflags & BIO_ERROR) {
		printf(
		    "swap_pager: I/O error - %s failed; blkno %ld,"
			"size %ld, error %d\n",
		    ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
		    (long)bp->b_blkno, 
		    (long)bp->b_bcount,
		    bp->b_error
		);
	}

	/*
	 * set object, raise to splvm().
	 */

	if (bp->b_npages)
		object = bp->b_pages[0]->object;
	s = splvm();

	/*
	 * remove the mapping for kernel virtual
	 */

	pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);

	/*
	 * cleanup pages.  If an error occurs writing to swap, we are in
	 * very serious trouble.  If it happens to be a disk error, though,
	 * we may be able to recover by reassigning the swap later on.  So
	 * in this case we remove the m->swapblk assignment for the page 
	 * but do not free it in the rlist.  The erroneous block(s) are thus
	 * never reallocated as swap.  Redirty the page and continue.
	 */

	for (i = 0; i < bp->b_npages; ++i) {
		vm_page_t m = bp->b_pages[i];

		vm_page_flag_clear(m, PG_SWAPINPROG);

		if (bp->b_ioflags & BIO_ERROR) {
			/*
			 * If an error occurs I'd love to throw the swapblk
			 * away without freeing it back to swapspace, so it
			 * can never be used again.  But I can't from an 
			 * interrupt.
			 */

			if (bp->b_iocmd == BIO_READ) {
				/*
				 * When reading, reqpage needs to stay
				 * locked for the parent, but all other
				 * pages can be freed.
				 * We still want to wakeup the parent
				 * waiting on the page, though.  ( also:
				 * pg_reqpage can be -1 and not match
				 * anything ).
				 *
				 * We have to wake specifically requested pages
				 * up too because we cleared PG_SWAPINPROG and
				 * someone may be waiting for that.
				 *
				 * NOTE: for reads, m->dirty will probably
				 * be overridden by the original caller of
				 * getpages so don't play cute tricks here.
				 *
				 * XXX it may not be legal to free the page
				 * here as this messes with the object->memq's.
				 */

				m->valid = 0;
				vm_page_flag_clear(m, PG_ZERO);

				if (i != bp->b_pager.pg_reqpage)
					vm_page_free(m);
				else
					vm_page_flash(m);
				/*
				 * If i == bp->b_pager.pg_reqpage, do not wake 
				 * the page up.  The caller needs to.
				 */
			} else {
				/*
				 * If a write error occurs, reactivate page
				 * so it doesn't clog the inactive list,
				 * then finish the I/O.
				 */
				vm_page_dirty(m);
				vm_page_activate(m);
				vm_page_io_finish(m);
			}
		} else if (bp->b_iocmd == BIO_READ) {
			/*
			 * For read success, clear dirty bits.  Nobody should
			 * have this page mapped but don't take any chances,
			 * make sure the pmap modify bits are also cleared.
			 *
			 * NOTE: for reads, m->dirty will probably be 
			 * overridden by the original caller of getpages so
			 * we cannot set them in order to free the underlying
			 * swap in a low-swap situation.  I don't think we'd
			 * want to do that anyway, but it was an optimization
			 * that existed in the old swapper for a time before
			 * it got ripped out due to precisely this problem.
			 *
			 * clear PG_ZERO in page.
			 *
			 * If not the requested page then deactivate it.
			 *
			 * Note that the requested page, reqpage, is left
			 * busied, but we still have to wake it up.  The
			 * other pages are released (unbusied) by 
			 * vm_page_wakeup().  We do not set reqpage's
			 * valid bits here, it is up to the caller.
			 */

			pmap_clear_modify(m);
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
			vm_page_flag_clear(m, PG_ZERO);

			/*
			 * We have to wake specifically requested pages
			 * up too because we cleared PG_SWAPINPROG and
			 * someone could be waiting for it in getpages.
			 * However, be sure to not unbusy getpages
			 * specifically requested page - getpages expects
			 * it to be left busy.
			 */
			if (i != bp->b_pager.pg_reqpage) {
				vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vm_page_flash(m);
			}
		} else {
			/*
			 * For write success, clear the modify and dirty 
			 * status, then finish the I/O ( which decrements the 
			 * busy count and possibly wakes waiters up ).
			 */
			vm_page_protect(m, VM_PROT_READ);
			pmap_clear_modify(m);
			vm_page_undirty(m);
			vm_page_io_finish(m);
		}
	}

	/*
	 * adjust pip.  NOTE: the original parent may still have its own
	 * pip refs on the object.
	 */

	if (object)
		vm_object_pip_wakeupn(object, bp->b_npages);

	/*
	 * release the physical I/O buffer
	 */

	relpbuf(
	    bp, 
	    ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : 
		((bp->b_flags & B_ASYNC) ? 
		    &nsw_wcount_async : 
		    &nsw_wcount_sync
		)
	    )
	);
	splx(s);
}

/************************************************************************
 *				SWAP META DATA 				*
 ************************************************************************
 *
 *	These routines manipulate the swap metadata stored in the 
 *	OBJT_SWAP object.  All swp_*() routines must be called at 
 *	splvm() because swap can be freed up by the low level vm_page
 *	code which might be called from interrupts beyond what splbio() covers.
 *
 *	Swap metadata is implemented with a global hash and not directly
 *	linked into the object.  Instead the object simply contains
 *	appropriate tracking counters.
 */

/*
 * SWP_PAGER_HASH() -	hash swap meta data
 *
 *	This is an inline helper function which hashes the swapblk given
 *	the object and page index.  It returns a pointer to a pointer
 *	to the swblock, or a pointer to a NULL pointer if it could not
 *	find a swapblk.
 *
 *	This routine must be called at splvm().
 */

static __inline struct swblock **
swp_pager_hash(vm_object_t object, vm_pindex_t index)
{
	struct swblock **pswap;
	struct swblock *swap;

	index &= ~SWAP_META_MASK;
	pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];

	while ((swap = *pswap) != NULL) {
		if (swap->swb_object == object &&
		    swap->swb_index == index
		) {
			break;
		}
		pswap = &swap->swb_hnext;
	}
	return(pswap);
}

/*
 * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
 *
 *	We first convert the object to a swap object if it is a default
 *	object.
 *
 *	The specified swapblk is added to the object's swap metadata.  If
 *	the swapblk is not valid, it is freed instead.  Any previously
 *	assigned swapblk is freed.
 *
 *	This routine must be called at splvm(), except when used to convert
 *	an OBJT_DEFAULT object into an OBJT_SWAP object.
 */

static void
swp_pager_meta_build(
	vm_object_t object, 
	vm_pindex_t index,
	daddr_t swapblk
) {
	struct swblock *swap;
	struct swblock **pswap;

	/*
	 * Convert default object to swap object if necessary
	 */

	if (object->type != OBJT_SWAP) {
		object->type = OBJT_SWAP;
		object->un_pager.swp.swp_bcount = 0;

		if (object->handle != NULL) {
			TAILQ_INSERT_TAIL(
			    NOBJLIST(object->handle),
			    object, 
			    pager_object_list
			);
		} else {
			TAILQ_INSERT_TAIL(
			    &swap_pager_un_object_list,
			    object, 
			    pager_object_list
			);
		}
	}

	/*
	 * Locate hash entry.  If not found create, but if we aren't adding
	 * anything just return.  If we run out of space in the map we wait
	 * and, since the hash table may have changed, retry.
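	 *
	 * Note that each swblock covers SWAP_META_PAGES consecutive page
	 * indices; the low bits of the index select the slot within
	 * swb_pages[] once the entry has been located.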
	 */

retry:
	pswap = swp_pager_hash(object, index);

	if ((swap = *pswap) == NULL) {
		int i;

		if (swapblk == SWAPBLK_NONE)
			return;

		swap = *pswap = zalloc(swap_zone);
		if (swap == NULL) {
			VM_WAIT;
			goto retry;
		}
		swap->swb_hnext = NULL;
		swap->swb_object = object;
		swap->swb_index = index & ~SWAP_META_MASK;
		swap->swb_count = 0;

		++object->un_pager.swp.swp_bcount;

		for (i = 0; i < SWAP_META_PAGES; ++i)
			swap->swb_pages[i] = SWAPBLK_NONE;
	}

	/*
	 * Delete prior contents of metadata
	 */

	index &= SWAP_META_MASK;

	if (swap->swb_pages[index] != SWAPBLK_NONE) {
		swp_pager_freeswapspace(swap->swb_pages[index], 1);
		--swap->swb_count;
	}

	/*
	 * Enter block into metadata
	 */

	swap->swb_pages[index] = swapblk;
	if (swapblk != SWAPBLK_NONE)
		++swap->swb_count;
}

/*
 * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
 *
 *	The requested range of blocks is freed, with any associated swap
 *	returned to the swap bitmap.
 *
 *	This routine will free swap metadata structures as they are cleaned 
 *	out.  This routine does *NOT* operate on swap metadata associated
 *	with resident pages.
 *
 *	This routine must be called at splvm()
 */

static void
swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
{
	if (object->type != OBJT_SWAP)
		return;

	while (count > 0) {
		struct swblock **pswap;
		struct swblock *swap;

		pswap = swp_pager_hash(object, index);

		if ((swap = *pswap) != NULL) {
			daddr_t v = swap->swb_pages[index & SWAP_META_MASK];

			if (v != SWAPBLK_NONE) {
				swp_pager_freeswapspace(v, 1);
				swap->swb_pages[index & SWAP_META_MASK] =
					SWAPBLK_NONE;
				if (--swap->swb_count == 0) {
					*pswap = swap->swb_hnext;
					zfree(swap_zone, swap);
					--object->un_pager.swp.swp_bcount;
				}
			}
			--count;
			++index;
		} else {
			int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
			count -= n;
			index += n;
		}
	}
}

/*
 * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
 *
 *	This routine locates and destroys all swap metadata associated with
 *	an object.
 *
 *	This routine must be called at splvm()
 */

static void
swp_pager_meta_free_all(vm_object_t object)
{
	daddr_t index = 0;

	if (object->type != OBJT_SWAP)
		return;

	while (object->un_pager.swp.swp_bcount) {
		struct swblock **pswap;
		struct swblock *swap;

		pswap = swp_pager_hash(object, index);
		if ((swap = *pswap) != NULL) {
			int i;

			for (i = 0; i < SWAP_META_PAGES; ++i) {
				daddr_t v = swap->swb_pages[i];
				if (v != SWAPBLK_NONE) {
					--swap->swb_count;
					swp_pager_freeswapspace(v, 1);
				}
			}
			if (swap->swb_count != 0)
				panic("swap_pager_meta_free_all: swb_count != 0");
			*pswap = swap->swb_hnext;
			zfree(swap_zone, swap);
			--object->un_pager.swp.swp_bcount;
		}
		index += SWAP_META_PAGES;
		if (index > 0x20000000)
			panic("swp_pager_meta_free_all: failed to locate all swap meta blocks");
	}
}

/*
 * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
 *
 *	This routine is capable of looking up, popping, or freeing
 *	swapblk assignments in the swap meta data or in the vm_page_t.
 *	The routine typically returns the swapblk being looked-up, or popped,
 *	or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
 *	was invalid.  This routine will automatically free any invalid 
 *	meta-data swapblks.
 *
 *	It is not possible to store invalid swapblks in the swap meta data
 *	(other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
 *
 *	When acting on a busy resident page and paging is in progress, we 
 *	have to wait until paging is complete but otherwise can act on the 
 *	busy page.
 *
 *	This routine must be called at splvm().
 *
 *	SWM_FREE	remove and free swap block from metadata
 *	SWM_POP		remove from meta data but do not free.. pop it out
 */

static daddr_t
swp_pager_meta_ctl(
	vm_object_t object,
	vm_pindex_t index,
	int flags
) {
	struct swblock **pswap;
	struct swblock *swap;
	daddr_t r1;

	/*
	 * The meta data only exists if the object is OBJT_SWAP 
	 * and even then might not be allocated yet.
	 */

	if (object->type != OBJT_SWAP)
		return(SWAPBLK_NONE);

	r1 = SWAPBLK_NONE;
	pswap = swp_pager_hash(object, index);

	if ((swap = *pswap) != NULL) {
		index &= SWAP_META_MASK;
		r1 = swap->swb_pages[index];

		if (r1 != SWAPBLK_NONE) {
			if (flags & SWM_FREE) {
				swp_pager_freeswapspace(r1, 1);
				r1 = SWAPBLK_NONE;
			}
			if (flags & (SWM_FREE|SWM_POP)) {
				swap->swb_pages[index] = SWAPBLK_NONE;
				if (--swap->swb_count == 0) {
					*pswap = swap->swb_hnext;
					zfree(swap_zone, swap);
					--object->un_pager.swp.swp_bcount;
				}
			}
		}
	}
	return(r1);
}

/********************************************************
 *		CHAINING FUNCTIONS			*
 ********************************************************
 *
 *	These functions support recursion of I/O operations
 *	on bp's, typically by chaining one or more 'child' bp's
 *	to the parent.  Synchronous, asynchronous, and semi-synchronous
 *	chaining is possible.
 */

/*
 *	vm_pager_chain_iodone:
 *
 *	io completion routine for child bp.  Currently we fudge a bit
 *	on dealing with b_resid.  Since users of these routines may issue
 *	multiple children simultaneously, sequencing of the error can be lost.
 */

static void
vm_pager_chain_iodone(struct buf *nbp)
{
	struct bio *bp;
	u_int *count;

	bp = nbp->b_caller1;
	count = (u_int *)&(bp->bio_caller1);
	if (bp != NULL) {
		if (nbp->b_ioflags & BIO_ERROR) {
			bp->bio_flags |= BIO_ERROR;
			bp->bio_error = nbp->b_error;
		} else if (nbp->b_resid != 0) {
			bp->bio_flags |= BIO_ERROR;
			bp->bio_error = EINVAL;
		} else {
			bp->bio_resid -= nbp->b_bcount;
		}
		nbp->b_caller1 = NULL;
		--(*count);
		if (bp->bio_flags & BIO_FLAG1) {
			bp->bio_flags &= ~BIO_FLAG1;
			wakeup(bp);
		}
	}
	nbp->b_flags |= B_DONE;
	nbp->b_flags &= ~B_ASYNC;
	relpbuf(nbp, NULL);
}

/*
 *	getchainbuf:
 *
 *	Obtain a physical buffer and chain it to its parent buffer.  When
 *	I/O completes, the parent buffer will be B_SIGNAL'd.
 *	Errors are automatically propagated to the parent
 */

struct buf *
getchainbuf(struct bio *bp, struct vnode *vp, int flags)
{
	struct buf *nbp = getpbuf(NULL);
	u_int *count = (u_int *)&(bp->bio_caller1);

	nbp->b_caller1 = bp;
	++(*count);

	if (*count > 4)
		waitchainbuf(bp, 4, 0);

	nbp->b_iocmd = bp->bio_cmd;
	nbp->b_ioflags = bp->bio_flags & BIO_ORDERED;
	nbp->b_flags = flags;
	nbp->b_rcred = nbp->b_wcred = proc0.p_ucred;
	nbp->b_iodone = vm_pager_chain_iodone;

	crhold(nbp->b_rcred);
	crhold(nbp->b_wcred);

	if (vp)
		pbgetvp(vp, nbp);
	return(nbp);
}

void
flushchainbuf(struct buf *nbp)
{
	if (nbp->b_bcount) {
		nbp->b_bufsize = nbp->b_bcount;
		if (nbp->b_iocmd == BIO_WRITE)
			nbp->b_dirtyend = nbp->b_bcount;
		BUF_KERNPROC(nbp);
		BUF_STRATEGY(nbp);
	} else {
		bufdone(nbp);
	}
}

void
waitchainbuf(struct bio *bp, int limit, int done)
{
	int s;
	u_int *count = (u_int *)&(bp->bio_caller1);

	s = splbio();
	while (*count > limit) {
		bp->bio_flags |= BIO_FLAG1;
		tsleep(bp, PRIBIO + 4, "bpchain", 0);
	}
	if (done) {
		if (bp->bio_resid != 0 && !(bp->bio_flags & BIO_ERROR)) {
			bp->bio_flags |= BIO_ERROR;
			bp->bio_error = EINVAL;
		}
		biodone(bp);
	}
	splx(s);
}