/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shm.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/page_cgroup.h>

static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
static int swap_overflow;
static int least_priority;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

static struct swap_list_t swap_list = {-1, -1};

static struct swap_info_struct swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

/* For reference count accounting in swap_map */
/* enum for swap_map[] handling. internal use only */
enum {
	SWAP_MAP = 0,	/* ops for reference from swap users */
	SWAP_CACHE,	/* ops for reference from swap cache */
};

static inline int swap_count(unsigned short ent)
{
	return ent & SWAP_COUNT_MASK;
}

static inline bool swap_has_cache(unsigned short ent)
{
	return !!(ent & SWAP_HAS_CACHE);
}

static inline unsigned short encode_swapmap(int count, bool has_cache)
{
	unsigned short ret = count;

	if (has_cache)
		return SWAP_HAS_CACHE | ret;
	return ret;
}

/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
	int type = si - swap_info;
	swp_entry_t entry = swp_entry(type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(&swapper_space, entry.val);
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map() and it's called
	 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
	 * We have to use trylock for avoiding deadlock. This is a special
	 * case and you should use try_to_free_swap() with explicit lock_page()
	 * in usual operations.
	 */
	if (trylock_page(page)) {
		ret = try_to_free_swap(page);
		unlock_page(page);
	}
	page_cache_release(page);
	return ret;
}
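/*
 * Illustrative sketch (not in the original source): how the helpers above
 * pack a swap_map[] slot.  Each unsigned short slot holds the user reference
 * count in the low bits (masked by SWAP_COUNT_MASK) and the SWAP_HAS_CACHE
 * flag in a high bit, so a slot is free only when both parts are zero.
 * The snippet below is compiled out; it merely demonstrates the round trip
 * through encode_swapmap()/swap_count()/swap_has_cache().
 */
#if 0
static void swap_map_encoding_example(void)
{
	/* two users of the entry, and a page sitting in the swap cache */
	unsigned short slot = encode_swapmap(2, true);

	BUG_ON(swap_count(slot) != 2);		/* low bits: reference count */
	BUG_ON(!swap_has_cache(slot));		/* high bit: SWAP_HAS_CACHE  */

	/* dropping the cache reference clears only the flag */
	slot = encode_swapmap(swap_count(slot), false);
	BUG_ON(swap_has_cache(slot));
	BUG_ON(swap_count(slot) != 2);
}
#endif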
/*
 * We need this because the bdev->unplug_fn can sleep and we cannot
 * hold swap_lock while calling the unplug_fn. And swap_lock
 * cannot be turned into a mutex.
 */
static DECLARE_RWSEM(swap_unplug_sem);

void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
{
	swp_entry_t entry;

	down_read(&swap_unplug_sem);
	entry.val = page_private(page);
	if (PageSwapCache(page)) {
		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
		struct backing_dev_info *bdi;

		/*
		 * If the page is removed from swapcache from under us (with a
		 * racy try_to_unuse/swapoff) we need an additional reference
		 * count to avoid reading garbage from page_private(page) above.
		 * If the WARN_ON triggers during a swapoff it maybe the race
		 * condition and it's harmless. However if it triggers without
		 * swapoff it signals a problem.
		 */
		WARN_ON(page_count(page) <= 1);

		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
		blk_run_backing_dev(bdi, page);
	}
	up_read(&swap_unplug_sem);
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	int err = 0;

	list_for_each_entry(se, &si->extent_list, list) {
		sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
		sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		if (se->start_page == 0) {
			/* Do not discard the swap header page! */
			start_block += 1 << (PAGE_SHIFT - 9);
			nr_blocks -= 1 << (PAGE_SHIFT - 9);
			if (!nr_blocks)
				continue;
		}

		err = blkdev_issue_discard(si->bdev, start_block,
						nr_blocks, GFP_KERNEL,
						DISCARD_FL_BARRIER);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		struct list_head *lh;

		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			if (!found_extent++)
				si->curr_swap_extent = se;

			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
							nr_blocks, GFP_NOIO,
							DISCARD_FL_BARRIER))
				break;
		}

		lh = se->list.next;
		if (lh == &si->extent_list)
			lh = lh->next;
		se = list_entry(lh, struct swap_extent, list);
	}
}

static int wait_for_discard(void *word)
{
	schedule();
	return 0;
}

#define SWAPFILE_CLUSTER	256
#define LATENCY_LIMIT		256
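/*
 * Illustrative sketch (not in the original source): the essence of the
 * empty-cluster search that scan_swap_map() below performs on swap_map[].
 * A cluster is SWAPFILE_CLUSTER consecutive free slots; the scan tracks the
 * prospective end of the current cluster and restarts it whenever a used
 * slot is met.  Compiled out; the helper name and the plain array parameter
 * are assumptions made for illustration only.
 */
#if 0
static unsigned long find_empty_cluster_example(unsigned short *map,
						unsigned long lowest,
						unsigned long highest)
{
	unsigned long offset = lowest;
	unsigned long last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

	for (; last_in_cluster <= highest; offset++) {
		if (map[offset])			/* slot in use: restart cluster */
			last_in_cluster = offset + SWAPFILE_CLUSTER;
		else if (offset == last_in_cluster)	/* whole free cluster found */
			return offset - (SWAPFILE_CLUSTER - 1);
	}
	return 0;					/* no free cluster found */
}
#endif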
static inline unsigned long scan_swap_map(struct swap_info_struct *si,
					  int cache)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int found_free_cluster = 0;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */

	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}
		if (si->flags & SWP_DISCARDABLE) {
			/*
			 * Start range check on racing allocations, in case
			 * they overlap the cluster we eventually decide on
			 * (we scan without swap_lock to allow preemption).
			 * It's hardly conceivable that cluster_nr could be
			 * wrapped during our scan, but don't depend on it.
			 */
			if (si->lowest_alloc)
				goto checks;
			si->lowest_alloc = si->max;
			si->highest_alloc = 0;
		}
		spin_unlock(&swap_lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * But if seek is cheap, search from our current position, so
		 * that swap is allocated from all over the partition: if the
		 * Flash Translation Layer only remaps within limited zones,
		 * we don't want to wear out the first zone too quickly.
		 */
		if (!(si->flags & SWP_SOLIDSTATE))
			scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster < scan_base; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&swap_lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
		si->lowest_alloc = 0;
	}

checks:
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	/* reuse swap entry of cache-only swap if not busy.
*/ 334 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 335 int swap_was_freed; 336 spin_unlock(&swap_lock); 337 swap_was_freed = __try_to_reclaim_swap(si, offset); 338 spin_lock(&swap_lock); 339 /* entry was freed successfully, try to use this again */ 340 if (swap_was_freed) 341 goto checks; 342 goto scan; /* check next one */ 343 } 344 345 if (si->swap_map[offset]) 346 goto scan; 347 348 if (offset == si->lowest_bit) 349 si->lowest_bit++; 350 if (offset == si->highest_bit) 351 si->highest_bit--; 352 si->inuse_pages++; 353 if (si->inuse_pages == si->pages) { 354 si->lowest_bit = si->max; 355 si->highest_bit = 0; 356 } 357 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ 358 si->swap_map[offset] = encode_swapmap(0, true); 359 else /* at suspend */ 360 si->swap_map[offset] = encode_swapmap(1, false); 361 si->cluster_next = offset + 1; 362 si->flags -= SWP_SCANNING; 363 364 if (si->lowest_alloc) { 365 /* 366 * Only set when SWP_DISCARDABLE, and there's a scan 367 * for a free cluster in progress or just completed. 368 */ 369 if (found_free_cluster) { 370 /* 371 * To optimize wear-levelling, discard the 372 * old data of the cluster, taking care not to 373 * discard any of its pages that have already 374 * been allocated by racing tasks (offset has 375 * already stepped over any at the beginning). 376 */ 377 if (offset < si->highest_alloc && 378 si->lowest_alloc <= last_in_cluster) 379 last_in_cluster = si->lowest_alloc - 1; 380 si->flags |= SWP_DISCARDING; 381 spin_unlock(&swap_lock); 382 383 if (offset < last_in_cluster) 384 discard_swap_cluster(si, offset, 385 last_in_cluster - offset + 1); 386 387 spin_lock(&swap_lock); 388 si->lowest_alloc = 0; 389 si->flags &= ~SWP_DISCARDING; 390 391 smp_mb(); /* wake_up_bit advises this */ 392 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); 393 394 } else if (si->flags & SWP_DISCARDING) { 395 /* 396 * Delay using pages allocated by racing tasks 397 * until the whole discard has been issued. We 398 * could defer that delay until swap_writepage, 399 * but it's easier to keep this self-contained. 400 */ 401 spin_unlock(&swap_lock); 402 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), 403 wait_for_discard, TASK_UNINTERRUPTIBLE); 404 spin_lock(&swap_lock); 405 } else { 406 /* 407 * Note pages allocated by racing tasks while 408 * scan for a free cluster is in progress, so 409 * that its final discard can exclude them. 
410 */ 411 if (offset < si->lowest_alloc) 412 si->lowest_alloc = offset; 413 if (offset > si->highest_alloc) 414 si->highest_alloc = offset; 415 } 416 } 417 return offset; 418 419 scan: 420 spin_unlock(&swap_lock); 421 while (++offset <= si->highest_bit) { 422 if (!si->swap_map[offset]) { 423 spin_lock(&swap_lock); 424 goto checks; 425 } 426 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 427 spin_lock(&swap_lock); 428 goto checks; 429 } 430 if (unlikely(--latency_ration < 0)) { 431 cond_resched(); 432 latency_ration = LATENCY_LIMIT; 433 } 434 } 435 offset = si->lowest_bit; 436 while (++offset < scan_base) { 437 if (!si->swap_map[offset]) { 438 spin_lock(&swap_lock); 439 goto checks; 440 } 441 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 442 spin_lock(&swap_lock); 443 goto checks; 444 } 445 if (unlikely(--latency_ration < 0)) { 446 cond_resched(); 447 latency_ration = LATENCY_LIMIT; 448 } 449 } 450 spin_lock(&swap_lock); 451 452 no_page: 453 si->flags -= SWP_SCANNING; 454 return 0; 455 } 456 457 swp_entry_t get_swap_page(void) 458 { 459 struct swap_info_struct *si; 460 pgoff_t offset; 461 int type, next; 462 int wrapped = 0; 463 464 spin_lock(&swap_lock); 465 if (nr_swap_pages <= 0) 466 goto noswap; 467 nr_swap_pages--; 468 469 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 470 si = swap_info + type; 471 next = si->next; 472 if (next < 0 || 473 (!wrapped && si->prio != swap_info[next].prio)) { 474 next = swap_list.head; 475 wrapped++; 476 } 477 478 if (!si->highest_bit) 479 continue; 480 if (!(si->flags & SWP_WRITEOK)) 481 continue; 482 483 swap_list.next = next; 484 /* This is called for allocating swap entry for cache */ 485 offset = scan_swap_map(si, SWAP_CACHE); 486 if (offset) { 487 spin_unlock(&swap_lock); 488 return swp_entry(type, offset); 489 } 490 next = swap_list.next; 491 } 492 493 nr_swap_pages++; 494 noswap: 495 spin_unlock(&swap_lock); 496 return (swp_entry_t) {0}; 497 } 498 499 /* The only caller of this function is now susupend routine */ 500 swp_entry_t get_swap_page_of_type(int type) 501 { 502 struct swap_info_struct *si; 503 pgoff_t offset; 504 505 spin_lock(&swap_lock); 506 si = swap_info + type; 507 if (si->flags & SWP_WRITEOK) { 508 nr_swap_pages--; 509 /* This is called for allocating swap entry, not cache */ 510 offset = scan_swap_map(si, SWAP_MAP); 511 if (offset) { 512 spin_unlock(&swap_lock); 513 return swp_entry(type, offset); 514 } 515 nr_swap_pages++; 516 } 517 spin_unlock(&swap_lock); 518 return (swp_entry_t) {0}; 519 } 520 521 static struct swap_info_struct * swap_info_get(swp_entry_t entry) 522 { 523 struct swap_info_struct * p; 524 unsigned long offset, type; 525 526 if (!entry.val) 527 goto out; 528 type = swp_type(entry); 529 if (type >= nr_swapfiles) 530 goto bad_nofile; 531 p = & swap_info[type]; 532 if (!(p->flags & SWP_USED)) 533 goto bad_device; 534 offset = swp_offset(entry); 535 if (offset >= p->max) 536 goto bad_offset; 537 if (!p->swap_map[offset]) 538 goto bad_free; 539 spin_lock(&swap_lock); 540 return p; 541 542 bad_free: 543 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 544 goto out; 545 bad_offset: 546 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 547 goto out; 548 bad_device: 549 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 550 goto out; 551 bad_nofile: 552 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 553 out: 554 return NULL; 555 } 556 557 static int swap_entry_free(struct swap_info_struct *p, 558 
swp_entry_t ent, int cache) 559 { 560 unsigned long offset = swp_offset(ent); 561 int count = swap_count(p->swap_map[offset]); 562 bool has_cache; 563 564 has_cache = swap_has_cache(p->swap_map[offset]); 565 566 if (cache == SWAP_MAP) { /* dropping usage count of swap */ 567 if (count < SWAP_MAP_MAX) { 568 count--; 569 p->swap_map[offset] = encode_swapmap(count, has_cache); 570 } 571 } else { /* dropping swap cache flag */ 572 VM_BUG_ON(!has_cache); 573 p->swap_map[offset] = encode_swapmap(count, false); 574 575 } 576 /* return code. */ 577 count = p->swap_map[offset]; 578 /* free if no reference */ 579 if (!count) { 580 if (offset < p->lowest_bit) 581 p->lowest_bit = offset; 582 if (offset > p->highest_bit) 583 p->highest_bit = offset; 584 if (p->prio > swap_info[swap_list.next].prio) 585 swap_list.next = p - swap_info; 586 nr_swap_pages++; 587 p->inuse_pages--; 588 } 589 if (!swap_count(count)) 590 mem_cgroup_uncharge_swap(ent); 591 return count; 592 } 593 594 /* 595 * Caller has made sure that the swapdevice corresponding to entry 596 * is still around or has not been recycled. 597 */ 598 void swap_free(swp_entry_t entry) 599 { 600 struct swap_info_struct * p; 601 602 p = swap_info_get(entry); 603 if (p) { 604 swap_entry_free(p, entry, SWAP_MAP); 605 spin_unlock(&swap_lock); 606 } 607 } 608 609 /* 610 * Called after dropping swapcache to decrease refcnt to swap entries. 611 */ 612 void swapcache_free(swp_entry_t entry, struct page *page) 613 { 614 struct swap_info_struct *p; 615 int ret; 616 617 p = swap_info_get(entry); 618 if (p) { 619 ret = swap_entry_free(p, entry, SWAP_CACHE); 620 if (page) { 621 bool swapout; 622 if (ret) 623 swapout = true; /* the end of swap out */ 624 else 625 swapout = false; /* no more swap users! */ 626 mem_cgroup_uncharge_swapcache(page, entry, swapout); 627 } 628 spin_unlock(&swap_lock); 629 } 630 return; 631 } 632 633 /* 634 * How many references to page are currently swapped out? 635 */ 636 static inline int page_swapcount(struct page *page) 637 { 638 int count = 0; 639 struct swap_info_struct *p; 640 swp_entry_t entry; 641 642 entry.val = page_private(page); 643 p = swap_info_get(entry); 644 if (p) { 645 count = swap_count(p->swap_map[swp_offset(entry)]); 646 spin_unlock(&swap_lock); 647 } 648 return count; 649 } 650 651 /* 652 * We can write to an anon page without COW if there are no other references 653 * to it. And as a side-effect, free up its swap: because the old content 654 * on disk will never be read, and seeking back there to write new content 655 * later would only waste time away from clustering. 656 */ 657 int reuse_swap_page(struct page *page) 658 { 659 int count; 660 661 VM_BUG_ON(!PageLocked(page)); 662 count = page_mapcount(page); 663 if (count <= 1 && PageSwapCache(page)) { 664 count += page_swapcount(page); 665 if (count == 1 && !PageWriteback(page)) { 666 delete_from_swap_cache(page); 667 SetPageDirty(page); 668 } 669 } 670 return count == 1; 671 } 672 673 /* 674 * If swap is getting full, or if there are no more mappings of this page, 675 * then try_to_free_swap is called to free its swap space. 676 */ 677 int try_to_free_swap(struct page *page) 678 { 679 VM_BUG_ON(!PageLocked(page)); 680 681 if (!PageSwapCache(page)) 682 return 0; 683 if (PageWriteback(page)) 684 return 0; 685 if (page_swapcount(page)) 686 return 0; 687 688 delete_from_swap_cache(page); 689 SetPageDirty(page); 690 return 1; 691 } 692 693 /* 694 * Free the swap entry like above, but also try to 695 * free the page cache entry if it is the last user. 
696 */ 697 int free_swap_and_cache(swp_entry_t entry) 698 { 699 struct swap_info_struct *p; 700 struct page *page = NULL; 701 702 if (non_swap_entry(entry)) 703 return 1; 704 705 p = swap_info_get(entry); 706 if (p) { 707 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { 708 page = find_get_page(&swapper_space, entry.val); 709 if (page && !trylock_page(page)) { 710 page_cache_release(page); 711 page = NULL; 712 } 713 } 714 spin_unlock(&swap_lock); 715 } 716 if (page) { 717 /* 718 * Not mapped elsewhere, or swap space full? Free it! 719 * Also recheck PageSwapCache now page is locked (above). 720 */ 721 if (PageSwapCache(page) && !PageWriteback(page) && 722 (!page_mapped(page) || vm_swap_full())) { 723 delete_from_swap_cache(page); 724 SetPageDirty(page); 725 } 726 unlock_page(page); 727 page_cache_release(page); 728 } 729 return p != NULL; 730 } 731 732 #ifdef CONFIG_HIBERNATION 733 /* 734 * Find the swap type that corresponds to given device (if any). 735 * 736 * @offset - number of the PAGE_SIZE-sized block of the device, starting 737 * from 0, in which the swap header is expected to be located. 738 * 739 * This is needed for the suspend to disk (aka swsusp). 740 */ 741 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 742 { 743 struct block_device *bdev = NULL; 744 int i; 745 746 if (device) 747 bdev = bdget(device); 748 749 spin_lock(&swap_lock); 750 for (i = 0; i < nr_swapfiles; i++) { 751 struct swap_info_struct *sis = swap_info + i; 752 753 if (!(sis->flags & SWP_WRITEOK)) 754 continue; 755 756 if (!bdev) { 757 if (bdev_p) 758 *bdev_p = bdgrab(sis->bdev); 759 760 spin_unlock(&swap_lock); 761 return i; 762 } 763 if (bdev == sis->bdev) { 764 struct swap_extent *se; 765 766 se = list_entry(sis->extent_list.next, 767 struct swap_extent, list); 768 if (se->start_block == offset) { 769 if (bdev_p) 770 *bdev_p = bdgrab(sis->bdev); 771 772 spin_unlock(&swap_lock); 773 bdput(bdev); 774 return i; 775 } 776 } 777 } 778 spin_unlock(&swap_lock); 779 if (bdev) 780 bdput(bdev); 781 782 return -ENODEV; 783 } 784 785 /* 786 * Return either the total number of swap pages of given type, or the number 787 * of free pages of that type (depending on @free) 788 * 789 * This is needed for software suspend 790 */ 791 unsigned int count_swap_pages(int type, int free) 792 { 793 unsigned int n = 0; 794 795 if (type < nr_swapfiles) { 796 spin_lock(&swap_lock); 797 if (swap_info[type].flags & SWP_WRITEOK) { 798 n = swap_info[type].pages; 799 if (free) 800 n -= swap_info[type].inuse_pages; 801 } 802 spin_unlock(&swap_lock); 803 } 804 return n; 805 } 806 #endif 807 808 /* 809 * No need to decide whether this PTE shares the swap entry with others, 810 * just let do_wp_page work it out if a write is requested later - to 811 * force COW, vm_page_prot omits write permission from any private vma. 
812 */ 813 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 814 unsigned long addr, swp_entry_t entry, struct page *page) 815 { 816 struct mem_cgroup *ptr = NULL; 817 spinlock_t *ptl; 818 pte_t *pte; 819 int ret = 1; 820 821 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { 822 ret = -ENOMEM; 823 goto out_nolock; 824 } 825 826 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 827 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 828 if (ret > 0) 829 mem_cgroup_cancel_charge_swapin(ptr); 830 ret = 0; 831 goto out; 832 } 833 834 inc_mm_counter(vma->vm_mm, anon_rss); 835 get_page(page); 836 set_pte_at(vma->vm_mm, addr, pte, 837 pte_mkold(mk_pte(page, vma->vm_page_prot))); 838 page_add_anon_rmap(page, vma, addr); 839 mem_cgroup_commit_charge_swapin(page, ptr); 840 swap_free(entry); 841 /* 842 * Move the page to the active list so it is not 843 * immediately swapped out again after swapon. 844 */ 845 activate_page(page); 846 out: 847 pte_unmap_unlock(pte, ptl); 848 out_nolock: 849 return ret; 850 } 851 852 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 853 unsigned long addr, unsigned long end, 854 swp_entry_t entry, struct page *page) 855 { 856 pte_t swp_pte = swp_entry_to_pte(entry); 857 pte_t *pte; 858 int ret = 0; 859 860 /* 861 * We don't actually need pte lock while scanning for swp_pte: since 862 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the 863 * page table while we're scanning; though it could get zapped, and on 864 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 865 * of unmatched parts which look like swp_pte, so unuse_pte must 866 * recheck under pte lock. Scanning without pte lock lets it be 867 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 868 */ 869 pte = pte_offset_map(pmd, addr); 870 do { 871 /* 872 * swapoff spends a _lot_ of time in this loop! 873 * Test inline before going to call unuse_pte. 
874 */ 875 if (unlikely(pte_same(*pte, swp_pte))) { 876 pte_unmap(pte); 877 ret = unuse_pte(vma, pmd, addr, entry, page); 878 if (ret) 879 goto out; 880 pte = pte_offset_map(pmd, addr); 881 } 882 } while (pte++, addr += PAGE_SIZE, addr != end); 883 pte_unmap(pte - 1); 884 out: 885 return ret; 886 } 887 888 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 889 unsigned long addr, unsigned long end, 890 swp_entry_t entry, struct page *page) 891 { 892 pmd_t *pmd; 893 unsigned long next; 894 int ret; 895 896 pmd = pmd_offset(pud, addr); 897 do { 898 next = pmd_addr_end(addr, end); 899 if (pmd_none_or_clear_bad(pmd)) 900 continue; 901 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 902 if (ret) 903 return ret; 904 } while (pmd++, addr = next, addr != end); 905 return 0; 906 } 907 908 static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 909 unsigned long addr, unsigned long end, 910 swp_entry_t entry, struct page *page) 911 { 912 pud_t *pud; 913 unsigned long next; 914 int ret; 915 916 pud = pud_offset(pgd, addr); 917 do { 918 next = pud_addr_end(addr, end); 919 if (pud_none_or_clear_bad(pud)) 920 continue; 921 ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 922 if (ret) 923 return ret; 924 } while (pud++, addr = next, addr != end); 925 return 0; 926 } 927 928 static int unuse_vma(struct vm_area_struct *vma, 929 swp_entry_t entry, struct page *page) 930 { 931 pgd_t *pgd; 932 unsigned long addr, end, next; 933 int ret; 934 935 if (page->mapping) { 936 addr = page_address_in_vma(page, vma); 937 if (addr == -EFAULT) 938 return 0; 939 else 940 end = addr + PAGE_SIZE; 941 } else { 942 addr = vma->vm_start; 943 end = vma->vm_end; 944 } 945 946 pgd = pgd_offset(vma->vm_mm, addr); 947 do { 948 next = pgd_addr_end(addr, end); 949 if (pgd_none_or_clear_bad(pgd)) 950 continue; 951 ret = unuse_pud_range(vma, pgd, addr, next, entry, page); 952 if (ret) 953 return ret; 954 } while (pgd++, addr = next, addr != end); 955 return 0; 956 } 957 958 static int unuse_mm(struct mm_struct *mm, 959 swp_entry_t entry, struct page *page) 960 { 961 struct vm_area_struct *vma; 962 int ret = 0; 963 964 if (!down_read_trylock(&mm->mmap_sem)) { 965 /* 966 * Activate page so shrink_inactive_list is unlikely to unmap 967 * its ptes while lock is dropped, so swapoff can make progress. 968 */ 969 activate_page(page); 970 unlock_page(page); 971 down_read(&mm->mmap_sem); 972 lock_page(page); 973 } 974 for (vma = mm->mmap; vma; vma = vma->vm_next) { 975 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 976 break; 977 } 978 up_read(&mm->mmap_sem); 979 return (ret < 0)? ret: 0; 980 } 981 982 /* 983 * Scan swap_map from current position to next entry still in use. 984 * Recycle to start on reaching the end, returning 0 when empty. 985 */ 986 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 987 unsigned int prev) 988 { 989 unsigned int max = si->max; 990 unsigned int i = prev; 991 int count; 992 993 /* 994 * No need for swap_lock here: we're just looking 995 * for whether an entry is in use, not modifying it; false 996 * hits are okay, and sys_swapoff() has already prevented new 997 * allocations from this area (while holding swap_lock). 998 */ 999 for (;;) { 1000 if (++i >= max) { 1001 if (!prev) { 1002 i = 0; 1003 break; 1004 } 1005 /* 1006 * No entries in use at top of swap_map, 1007 * loop back to start and recheck there. 
1008 */ 1009 max = prev + 1; 1010 prev = 0; 1011 i = 1; 1012 } 1013 count = si->swap_map[i]; 1014 if (count && swap_count(count) != SWAP_MAP_BAD) 1015 break; 1016 } 1017 return i; 1018 } 1019 1020 /* 1021 * We completely avoid races by reading each swap page in advance, 1022 * and then search for the process using it. All the necessary 1023 * page table adjustments can then be made atomically. 1024 */ 1025 static int try_to_unuse(unsigned int type) 1026 { 1027 struct swap_info_struct * si = &swap_info[type]; 1028 struct mm_struct *start_mm; 1029 unsigned short *swap_map; 1030 unsigned short swcount; 1031 struct page *page; 1032 swp_entry_t entry; 1033 unsigned int i = 0; 1034 int retval = 0; 1035 int reset_overflow = 0; 1036 int shmem; 1037 1038 /* 1039 * When searching mms for an entry, a good strategy is to 1040 * start at the first mm we freed the previous entry from 1041 * (though actually we don't notice whether we or coincidence 1042 * freed the entry). Initialize this start_mm with a hold. 1043 * 1044 * A simpler strategy would be to start at the last mm we 1045 * freed the previous entry from; but that would take less 1046 * advantage of mmlist ordering, which clusters forked mms 1047 * together, child after parent. If we race with dup_mmap(), we 1048 * prefer to resolve parent before child, lest we miss entries 1049 * duplicated after we scanned child: using last mm would invert 1050 * that. Though it's only a serious concern when an overflowed 1051 * swap count is reset from SWAP_MAP_MAX, preventing a rescan. 1052 */ 1053 start_mm = &init_mm; 1054 atomic_inc(&init_mm.mm_users); 1055 1056 /* 1057 * Keep on scanning until all entries have gone. Usually, 1058 * one pass through swap_map is enough, but not necessarily: 1059 * there are races when an instance of an entry might be missed. 1060 */ 1061 while ((i = find_next_to_unuse(si, i)) != 0) { 1062 if (signal_pending(current)) { 1063 retval = -EINTR; 1064 break; 1065 } 1066 1067 /* 1068 * Get a page for the entry, using the existing swap 1069 * cache page if there is one. Otherwise, get a clean 1070 * page and read the swap into it. 1071 */ 1072 swap_map = &si->swap_map[i]; 1073 entry = swp_entry(type, i); 1074 page = read_swap_cache_async(entry, 1075 GFP_HIGHUSER_MOVABLE, NULL, 0); 1076 if (!page) { 1077 /* 1078 * Either swap_duplicate() failed because entry 1079 * has been freed independently, and will not be 1080 * reused since sys_swapoff() already disabled 1081 * allocation from here, or alloc_page() failed. 1082 */ 1083 if (!*swap_map) 1084 continue; 1085 retval = -ENOMEM; 1086 break; 1087 } 1088 1089 /* 1090 * Don't hold on to start_mm if it looks like exiting. 1091 */ 1092 if (atomic_read(&start_mm->mm_users) == 1) { 1093 mmput(start_mm); 1094 start_mm = &init_mm; 1095 atomic_inc(&init_mm.mm_users); 1096 } 1097 1098 /* 1099 * Wait for and lock page. When do_swap_page races with 1100 * try_to_unuse, do_swap_page can handle the fault much 1101 * faster than try_to_unuse can locate the entry. This 1102 * apparently redundant "wait_on_page_locked" lets try_to_unuse 1103 * defer to do_swap_page in such a case - in some tests, 1104 * do_swap_page and try_to_unuse repeatedly compete. 1105 */ 1106 wait_on_page_locked(page); 1107 wait_on_page_writeback(page); 1108 lock_page(page); 1109 wait_on_page_writeback(page); 1110 1111 /* 1112 * Remove all references to entry. 1113 * Whenever we reach init_mm, there's no address space 1114 * to search, but use it as a reminder to search shmem. 
1115 */ 1116 shmem = 0; 1117 swcount = *swap_map; 1118 if (swap_count(swcount)) { 1119 if (start_mm == &init_mm) 1120 shmem = shmem_unuse(entry, page); 1121 else 1122 retval = unuse_mm(start_mm, entry, page); 1123 } 1124 if (swap_count(*swap_map)) { 1125 int set_start_mm = (*swap_map >= swcount); 1126 struct list_head *p = &start_mm->mmlist; 1127 struct mm_struct *new_start_mm = start_mm; 1128 struct mm_struct *prev_mm = start_mm; 1129 struct mm_struct *mm; 1130 1131 atomic_inc(&new_start_mm->mm_users); 1132 atomic_inc(&prev_mm->mm_users); 1133 spin_lock(&mmlist_lock); 1134 while (swap_count(*swap_map) && !retval && !shmem && 1135 (p = p->next) != &start_mm->mmlist) { 1136 mm = list_entry(p, struct mm_struct, mmlist); 1137 if (!atomic_inc_not_zero(&mm->mm_users)) 1138 continue; 1139 spin_unlock(&mmlist_lock); 1140 mmput(prev_mm); 1141 prev_mm = mm; 1142 1143 cond_resched(); 1144 1145 swcount = *swap_map; 1146 if (!swap_count(swcount)) /* any usage ? */ 1147 ; 1148 else if (mm == &init_mm) { 1149 set_start_mm = 1; 1150 shmem = shmem_unuse(entry, page); 1151 } else 1152 retval = unuse_mm(mm, entry, page); 1153 1154 if (set_start_mm && *swap_map < swcount) { 1155 mmput(new_start_mm); 1156 atomic_inc(&mm->mm_users); 1157 new_start_mm = mm; 1158 set_start_mm = 0; 1159 } 1160 spin_lock(&mmlist_lock); 1161 } 1162 spin_unlock(&mmlist_lock); 1163 mmput(prev_mm); 1164 mmput(start_mm); 1165 start_mm = new_start_mm; 1166 } 1167 if (shmem) { 1168 /* page has already been unlocked and released */ 1169 if (shmem > 0) 1170 continue; 1171 retval = shmem; 1172 break; 1173 } 1174 if (retval) { 1175 unlock_page(page); 1176 page_cache_release(page); 1177 break; 1178 } 1179 1180 /* 1181 * How could swap count reach 0x7ffe ? 1182 * There's no way to repeat a swap page within an mm 1183 * (except in shmem, where it's the shared object which takes 1184 * the reference count)? 1185 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned 1186 * short is too small....) 1187 * If that's wrong, then we should worry more about 1188 * exit_mmap() and do_munmap() cases described above: 1189 * we might be resetting SWAP_MAP_MAX too early here. 1190 * We know "Undead"s can happen, they're okay, so don't 1191 * report them; but do report if we reset SWAP_MAP_MAX. 1192 */ 1193 /* We might release the lock_page() in unuse_mm(). */ 1194 if (!PageSwapCache(page) || page_private(page) != entry.val) 1195 goto retry; 1196 1197 if (swap_count(*swap_map) == SWAP_MAP_MAX) { 1198 spin_lock(&swap_lock); 1199 *swap_map = encode_swapmap(0, true); 1200 spin_unlock(&swap_lock); 1201 reset_overflow = 1; 1202 } 1203 1204 /* 1205 * If a reference remains (rare), we would like to leave 1206 * the page in the swap cache; but try_to_unmap could 1207 * then re-duplicate the entry once we drop page lock, 1208 * so we might loop indefinitely; also, that page could 1209 * not be swapped out to other storage meanwhile. So: 1210 * delete from cache even if there's another reference, 1211 * after ensuring that the data has been saved to disk - 1212 * since if the reference remains (rarer), it will be 1213 * read from disk into another page. Splitting into two 1214 * pages would be incorrect if swap supported "shared 1215 * private" pages, but they are handled by tmpfs files. 
1216 */ 1217 if (swap_count(*swap_map) && 1218 PageDirty(page) && PageSwapCache(page)) { 1219 struct writeback_control wbc = { 1220 .sync_mode = WB_SYNC_NONE, 1221 }; 1222 1223 swap_writepage(page, &wbc); 1224 lock_page(page); 1225 wait_on_page_writeback(page); 1226 } 1227 1228 /* 1229 * It is conceivable that a racing task removed this page from 1230 * swap cache just before we acquired the page lock at the top, 1231 * or while we dropped it in unuse_mm(). The page might even 1232 * be back in swap cache on another swap area: that we must not 1233 * delete, since it may not have been written out to swap yet. 1234 */ 1235 if (PageSwapCache(page) && 1236 likely(page_private(page) == entry.val)) 1237 delete_from_swap_cache(page); 1238 1239 /* 1240 * So we could skip searching mms once swap count went 1241 * to 1, we did not mark any present ptes as dirty: must 1242 * mark page dirty so shrink_page_list will preserve it. 1243 */ 1244 SetPageDirty(page); 1245 retry: 1246 unlock_page(page); 1247 page_cache_release(page); 1248 1249 /* 1250 * Make sure that we aren't completely killing 1251 * interactive performance. 1252 */ 1253 cond_resched(); 1254 } 1255 1256 mmput(start_mm); 1257 if (reset_overflow) { 1258 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); 1259 swap_overflow = 0; 1260 } 1261 return retval; 1262 } 1263 1264 /* 1265 * After a successful try_to_unuse, if no swap is now in use, we know 1266 * we can empty the mmlist. swap_lock must be held on entry and exit. 1267 * Note that mmlist_lock nests inside swap_lock, and an mm must be 1268 * added to the mmlist just after page_duplicate - before would be racy. 1269 */ 1270 static void drain_mmlist(void) 1271 { 1272 struct list_head *p, *next; 1273 unsigned int i; 1274 1275 for (i = 0; i < nr_swapfiles; i++) 1276 if (swap_info[i].inuse_pages) 1277 return; 1278 spin_lock(&mmlist_lock); 1279 list_for_each_safe(p, next, &init_mm.mmlist) 1280 list_del_init(p); 1281 spin_unlock(&mmlist_lock); 1282 } 1283 1284 /* 1285 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 1286 * corresponds to page offset `offset'. 1287 */ 1288 sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) 1289 { 1290 struct swap_extent *se = sis->curr_swap_extent; 1291 struct swap_extent *start_se = se; 1292 1293 for ( ; ; ) { 1294 struct list_head *lh; 1295 1296 if (se->start_page <= offset && 1297 offset < (se->start_page + se->nr_pages)) { 1298 return se->start_block + (offset - se->start_page); 1299 } 1300 lh = se->list.next; 1301 if (lh == &sis->extent_list) 1302 lh = lh->next; 1303 se = list_entry(lh, struct swap_extent, list); 1304 sis->curr_swap_extent = se; 1305 BUG_ON(se == start_se); /* It *must* be present */ 1306 } 1307 } 1308 1309 #ifdef CONFIG_HIBERNATION 1310 /* 1311 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 1312 * corresponding to given index in swap_info (swap type). 1313 */ 1314 sector_t swapdev_block(int swap_type, pgoff_t offset) 1315 { 1316 struct swap_info_struct *sis; 1317 1318 if (swap_type >= nr_swapfiles) 1319 return 0; 1320 1321 sis = swap_info + swap_type; 1322 return (sis->flags & SWP_WRITEOK) ? 
map_swap_page(sis, offset) : 0; 1323 } 1324 #endif /* CONFIG_HIBERNATION */ 1325 1326 /* 1327 * Free all of a swapdev's extent information 1328 */ 1329 static void destroy_swap_extents(struct swap_info_struct *sis) 1330 { 1331 while (!list_empty(&sis->extent_list)) { 1332 struct swap_extent *se; 1333 1334 se = list_entry(sis->extent_list.next, 1335 struct swap_extent, list); 1336 list_del(&se->list); 1337 kfree(se); 1338 } 1339 } 1340 1341 /* 1342 * Add a block range (and the corresponding page range) into this swapdev's 1343 * extent list. The extent list is kept sorted in page order. 1344 * 1345 * This function rather assumes that it is called in ascending page order. 1346 */ 1347 static int 1348 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1349 unsigned long nr_pages, sector_t start_block) 1350 { 1351 struct swap_extent *se; 1352 struct swap_extent *new_se; 1353 struct list_head *lh; 1354 1355 lh = sis->extent_list.prev; /* The highest page extent */ 1356 if (lh != &sis->extent_list) { 1357 se = list_entry(lh, struct swap_extent, list); 1358 BUG_ON(se->start_page + se->nr_pages != start_page); 1359 if (se->start_block + se->nr_pages == start_block) { 1360 /* Merge it */ 1361 se->nr_pages += nr_pages; 1362 return 0; 1363 } 1364 } 1365 1366 /* 1367 * No merge. Insert a new extent, preserving ordering. 1368 */ 1369 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 1370 if (new_se == NULL) 1371 return -ENOMEM; 1372 new_se->start_page = start_page; 1373 new_se->nr_pages = nr_pages; 1374 new_se->start_block = start_block; 1375 1376 list_add_tail(&new_se->list, &sis->extent_list); 1377 return 1; 1378 } 1379 1380 /* 1381 * A `swap extent' is a simple thing which maps a contiguous range of pages 1382 * onto a contiguous range of disk blocks. An ordered list of swap extents 1383 * is built at swapon time and is then used at swap_writepage/swap_readpage 1384 * time for locating where on disk a page belongs. 1385 * 1386 * If the swapfile is an S_ISBLK block device, a single extent is installed. 1387 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 1388 * swap files identically. 1389 * 1390 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 1391 * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 1392 * swapfiles are handled *identically* after swapon time. 1393 * 1394 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 1395 * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If 1396 * some stray blocks are found which do not fall within the PAGE_SIZE alignment 1397 * requirements, they are simply tossed out - we will never use those blocks 1398 * for swapping. 1399 * 1400 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This 1401 * prevents root from shooting her foot off by ftruncating an in-use swapfile, 1402 * which will scribble on the fs. 1403 * 1404 * The amount of disk space which a single swap extent represents varies. 1405 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 1406 * extents in the list. To avoid much list walking, we cache the previous 1407 * search location in `curr_swap_extent', and start new searches from there. 1408 * This is extremely effective. The average number of iterations in 1409 * map_swap_page() has been measured at about 0.3 per page. - akpm. 
1410 */ 1411 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1412 { 1413 struct inode *inode; 1414 unsigned blocks_per_page; 1415 unsigned long page_no; 1416 unsigned blkbits; 1417 sector_t probe_block; 1418 sector_t last_block; 1419 sector_t lowest_block = -1; 1420 sector_t highest_block = 0; 1421 int nr_extents = 0; 1422 int ret; 1423 1424 inode = sis->swap_file->f_mapping->host; 1425 if (S_ISBLK(inode->i_mode)) { 1426 ret = add_swap_extent(sis, 0, sis->max, 0); 1427 *span = sis->pages; 1428 goto done; 1429 } 1430 1431 blkbits = inode->i_blkbits; 1432 blocks_per_page = PAGE_SIZE >> blkbits; 1433 1434 /* 1435 * Map all the blocks into the extent list. This code doesn't try 1436 * to be very smart. 1437 */ 1438 probe_block = 0; 1439 page_no = 0; 1440 last_block = i_size_read(inode) >> blkbits; 1441 while ((probe_block + blocks_per_page) <= last_block && 1442 page_no < sis->max) { 1443 unsigned block_in_page; 1444 sector_t first_block; 1445 1446 first_block = bmap(inode, probe_block); 1447 if (first_block == 0) 1448 goto bad_bmap; 1449 1450 /* 1451 * It must be PAGE_SIZE aligned on-disk 1452 */ 1453 if (first_block & (blocks_per_page - 1)) { 1454 probe_block++; 1455 goto reprobe; 1456 } 1457 1458 for (block_in_page = 1; block_in_page < blocks_per_page; 1459 block_in_page++) { 1460 sector_t block; 1461 1462 block = bmap(inode, probe_block + block_in_page); 1463 if (block == 0) 1464 goto bad_bmap; 1465 if (block != first_block + block_in_page) { 1466 /* Discontiguity */ 1467 probe_block++; 1468 goto reprobe; 1469 } 1470 } 1471 1472 first_block >>= (PAGE_SHIFT - blkbits); 1473 if (page_no) { /* exclude the header page */ 1474 if (first_block < lowest_block) 1475 lowest_block = first_block; 1476 if (first_block > highest_block) 1477 highest_block = first_block; 1478 } 1479 1480 /* 1481 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks 1482 */ 1483 ret = add_swap_extent(sis, page_no, 1, first_block); 1484 if (ret < 0) 1485 goto out; 1486 nr_extents += ret; 1487 page_no++; 1488 probe_block += blocks_per_page; 1489 reprobe: 1490 continue; 1491 } 1492 ret = nr_extents; 1493 *span = 1 + highest_block - lowest_block; 1494 if (page_no == 0) 1495 page_no = 1; /* force Empty message */ 1496 sis->max = page_no; 1497 sis->pages = page_no - 1; 1498 sis->highest_bit = page_no - 1; 1499 done: 1500 sis->curr_swap_extent = list_entry(sis->extent_list.prev, 1501 struct swap_extent, list); 1502 goto out; 1503 bad_bmap: 1504 printk(KERN_ERR "swapon: swapfile has holes\n"); 1505 ret = -EINVAL; 1506 out: 1507 return ret; 1508 } 1509 1510 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1511 { 1512 struct swap_info_struct * p = NULL; 1513 unsigned short *swap_map; 1514 struct file *swap_file, *victim; 1515 struct address_space *mapping; 1516 struct inode *inode; 1517 char * pathname; 1518 int i, type, prev; 1519 int err; 1520 1521 if (!capable(CAP_SYS_ADMIN)) 1522 return -EPERM; 1523 1524 pathname = getname(specialfile); 1525 err = PTR_ERR(pathname); 1526 if (IS_ERR(pathname)) 1527 goto out; 1528 1529 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); 1530 putname(pathname); 1531 err = PTR_ERR(victim); 1532 if (IS_ERR(victim)) 1533 goto out; 1534 1535 mapping = victim->f_mapping; 1536 prev = -1; 1537 spin_lock(&swap_lock); 1538 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1539 p = swap_info + type; 1540 if (p->flags & SWP_WRITEOK) { 1541 if (p->swap_file->f_mapping == mapping) 1542 break; 1543 } 1544 prev = type; 1545 } 1546 if (type < 0) 
{ 1547 err = -EINVAL; 1548 spin_unlock(&swap_lock); 1549 goto out_dput; 1550 } 1551 if (!security_vm_enough_memory(p->pages)) 1552 vm_unacct_memory(p->pages); 1553 else { 1554 err = -ENOMEM; 1555 spin_unlock(&swap_lock); 1556 goto out_dput; 1557 } 1558 if (prev < 0) { 1559 swap_list.head = p->next; 1560 } else { 1561 swap_info[prev].next = p->next; 1562 } 1563 if (type == swap_list.next) { 1564 /* just pick something that's safe... */ 1565 swap_list.next = swap_list.head; 1566 } 1567 if (p->prio < 0) { 1568 for (i = p->next; i >= 0; i = swap_info[i].next) 1569 swap_info[i].prio = p->prio--; 1570 least_priority++; 1571 } 1572 nr_swap_pages -= p->pages; 1573 total_swap_pages -= p->pages; 1574 p->flags &= ~SWP_WRITEOK; 1575 spin_unlock(&swap_lock); 1576 1577 current->flags |= PF_OOM_ORIGIN; 1578 err = try_to_unuse(type); 1579 current->flags &= ~PF_OOM_ORIGIN; 1580 1581 if (err) { 1582 /* re-insert swap space back into swap_list */ 1583 spin_lock(&swap_lock); 1584 if (p->prio < 0) 1585 p->prio = --least_priority; 1586 prev = -1; 1587 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1588 if (p->prio >= swap_info[i].prio) 1589 break; 1590 prev = i; 1591 } 1592 p->next = i; 1593 if (prev < 0) 1594 swap_list.head = swap_list.next = p - swap_info; 1595 else 1596 swap_info[prev].next = p - swap_info; 1597 nr_swap_pages += p->pages; 1598 total_swap_pages += p->pages; 1599 p->flags |= SWP_WRITEOK; 1600 spin_unlock(&swap_lock); 1601 goto out_dput; 1602 } 1603 1604 /* wait for any unplug function to finish */ 1605 down_write(&swap_unplug_sem); 1606 up_write(&swap_unplug_sem); 1607 1608 destroy_swap_extents(p); 1609 mutex_lock(&swapon_mutex); 1610 spin_lock(&swap_lock); 1611 drain_mmlist(); 1612 1613 /* wait for anyone still in scan_swap_map */ 1614 p->highest_bit = 0; /* cuts scans short */ 1615 while (p->flags >= SWP_SCANNING) { 1616 spin_unlock(&swap_lock); 1617 schedule_timeout_uninterruptible(1); 1618 spin_lock(&swap_lock); 1619 } 1620 1621 swap_file = p->swap_file; 1622 p->swap_file = NULL; 1623 p->max = 0; 1624 swap_map = p->swap_map; 1625 p->swap_map = NULL; 1626 p->flags = 0; 1627 spin_unlock(&swap_lock); 1628 mutex_unlock(&swapon_mutex); 1629 vfree(swap_map); 1630 /* Destroy swap account informatin */ 1631 swap_cgroup_swapoff(type); 1632 1633 inode = mapping->host; 1634 if (S_ISBLK(inode->i_mode)) { 1635 struct block_device *bdev = I_BDEV(inode); 1636 set_blocksize(bdev, p->old_block_size); 1637 bd_release(bdev); 1638 } else { 1639 mutex_lock(&inode->i_mutex); 1640 inode->i_flags &= ~S_SWAPFILE; 1641 mutex_unlock(&inode->i_mutex); 1642 } 1643 filp_close(swap_file, NULL); 1644 err = 0; 1645 1646 out_dput: 1647 filp_close(victim, NULL); 1648 out: 1649 return err; 1650 } 1651 1652 #ifdef CONFIG_PROC_FS 1653 /* iterator */ 1654 static void *swap_start(struct seq_file *swap, loff_t *pos) 1655 { 1656 struct swap_info_struct *ptr = swap_info; 1657 int i; 1658 loff_t l = *pos; 1659 1660 mutex_lock(&swapon_mutex); 1661 1662 if (!l) 1663 return SEQ_START_TOKEN; 1664 1665 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1666 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1667 continue; 1668 if (!--l) 1669 return ptr; 1670 } 1671 1672 return NULL; 1673 } 1674 1675 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1676 { 1677 struct swap_info_struct *ptr; 1678 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1679 1680 if (v == SEQ_START_TOKEN) 1681 ptr = swap_info; 1682 else { 1683 ptr = v; 1684 ptr++; 1685 } 1686 1687 for (; ptr < endptr; ptr++) { 1688 if (!(ptr->flags & 
SWP_USED) || !ptr->swap_map) 1689 continue; 1690 ++*pos; 1691 return ptr; 1692 } 1693 1694 return NULL; 1695 } 1696 1697 static void swap_stop(struct seq_file *swap, void *v) 1698 { 1699 mutex_unlock(&swapon_mutex); 1700 } 1701 1702 static int swap_show(struct seq_file *swap, void *v) 1703 { 1704 struct swap_info_struct *ptr = v; 1705 struct file *file; 1706 int len; 1707 1708 if (ptr == SEQ_START_TOKEN) { 1709 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1710 return 0; 1711 } 1712 1713 file = ptr->swap_file; 1714 len = seq_path(swap, &file->f_path, " \t\n\\"); 1715 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1716 len < 40 ? 40 - len : 1, " ", 1717 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1718 "partition" : "file\t", 1719 ptr->pages << (PAGE_SHIFT - 10), 1720 ptr->inuse_pages << (PAGE_SHIFT - 10), 1721 ptr->prio); 1722 return 0; 1723 } 1724 1725 static const struct seq_operations swaps_op = { 1726 .start = swap_start, 1727 .next = swap_next, 1728 .stop = swap_stop, 1729 .show = swap_show 1730 }; 1731 1732 static int swaps_open(struct inode *inode, struct file *file) 1733 { 1734 return seq_open(file, &swaps_op); 1735 } 1736 1737 static const struct file_operations proc_swaps_operations = { 1738 .open = swaps_open, 1739 .read = seq_read, 1740 .llseek = seq_lseek, 1741 .release = seq_release, 1742 }; 1743 1744 static int __init procswaps_init(void) 1745 { 1746 proc_create("swaps", 0, NULL, &proc_swaps_operations); 1747 return 0; 1748 } 1749 __initcall(procswaps_init); 1750 #endif /* CONFIG_PROC_FS */ 1751 1752 #ifdef MAX_SWAPFILES_CHECK 1753 static int __init max_swapfiles_check(void) 1754 { 1755 MAX_SWAPFILES_CHECK(); 1756 return 0; 1757 } 1758 late_initcall(max_swapfiles_check); 1759 #endif 1760 1761 /* 1762 * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
1763 * 1764 * The swapon system call 1765 */ 1766 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 1767 { 1768 struct swap_info_struct * p; 1769 char *name = NULL; 1770 struct block_device *bdev = NULL; 1771 struct file *swap_file = NULL; 1772 struct address_space *mapping; 1773 unsigned int type; 1774 int i, prev; 1775 int error; 1776 union swap_header *swap_header = NULL; 1777 unsigned int nr_good_pages = 0; 1778 int nr_extents = 0; 1779 sector_t span; 1780 unsigned long maxpages = 1; 1781 unsigned long swapfilepages; 1782 unsigned short *swap_map = NULL; 1783 struct page *page = NULL; 1784 struct inode *inode = NULL; 1785 int did_down = 0; 1786 1787 if (!capable(CAP_SYS_ADMIN)) 1788 return -EPERM; 1789 spin_lock(&swap_lock); 1790 p = swap_info; 1791 for (type = 0 ; type < nr_swapfiles ; type++,p++) 1792 if (!(p->flags & SWP_USED)) 1793 break; 1794 error = -EPERM; 1795 if (type >= MAX_SWAPFILES) { 1796 spin_unlock(&swap_lock); 1797 goto out; 1798 } 1799 if (type >= nr_swapfiles) 1800 nr_swapfiles = type+1; 1801 memset(p, 0, sizeof(*p)); 1802 INIT_LIST_HEAD(&p->extent_list); 1803 p->flags = SWP_USED; 1804 p->next = -1; 1805 spin_unlock(&swap_lock); 1806 name = getname(specialfile); 1807 error = PTR_ERR(name); 1808 if (IS_ERR(name)) { 1809 name = NULL; 1810 goto bad_swap_2; 1811 } 1812 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); 1813 error = PTR_ERR(swap_file); 1814 if (IS_ERR(swap_file)) { 1815 swap_file = NULL; 1816 goto bad_swap_2; 1817 } 1818 1819 p->swap_file = swap_file; 1820 mapping = swap_file->f_mapping; 1821 inode = mapping->host; 1822 1823 error = -EBUSY; 1824 for (i = 0; i < nr_swapfiles; i++) { 1825 struct swap_info_struct *q = &swap_info[i]; 1826 1827 if (i == type || !q->swap_file) 1828 continue; 1829 if (mapping == q->swap_file->f_mapping) 1830 goto bad_swap; 1831 } 1832 1833 error = -EINVAL; 1834 if (S_ISBLK(inode->i_mode)) { 1835 bdev = I_BDEV(inode); 1836 error = bd_claim(bdev, sys_swapon); 1837 if (error < 0) { 1838 bdev = NULL; 1839 error = -EINVAL; 1840 goto bad_swap; 1841 } 1842 p->old_block_size = block_size(bdev); 1843 error = set_blocksize(bdev, PAGE_SIZE); 1844 if (error < 0) 1845 goto bad_swap; 1846 p->bdev = bdev; 1847 } else if (S_ISREG(inode->i_mode)) { 1848 p->bdev = inode->i_sb->s_bdev; 1849 mutex_lock(&inode->i_mutex); 1850 did_down = 1; 1851 if (IS_SWAPFILE(inode)) { 1852 error = -EBUSY; 1853 goto bad_swap; 1854 } 1855 } else { 1856 goto bad_swap; 1857 } 1858 1859 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 1860 1861 /* 1862 * Read the swap header. 1863 */ 1864 if (!mapping->a_ops->readpage) { 1865 error = -EINVAL; 1866 goto bad_swap; 1867 } 1868 page = read_mapping_page(mapping, 0, swap_file); 1869 if (IS_ERR(page)) { 1870 error = PTR_ERR(page); 1871 goto bad_swap; 1872 } 1873 swap_header = kmap(page); 1874 1875 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 1876 printk(KERN_ERR "Unable to find swap-space signature\n"); 1877 error = -EINVAL; 1878 goto bad_swap; 1879 } 1880 1881 /* swap partition endianess hack... 
*/ 1882 if (swab32(swap_header->info.version) == 1) { 1883 swab32s(&swap_header->info.version); 1884 swab32s(&swap_header->info.last_page); 1885 swab32s(&swap_header->info.nr_badpages); 1886 for (i = 0; i < swap_header->info.nr_badpages; i++) 1887 swab32s(&swap_header->info.badpages[i]); 1888 } 1889 /* Check the swap header's sub-version */ 1890 if (swap_header->info.version != 1) { 1891 printk(KERN_WARNING 1892 "Unable to handle swap header version %d\n", 1893 swap_header->info.version); 1894 error = -EINVAL; 1895 goto bad_swap; 1896 } 1897 1898 p->lowest_bit = 1; 1899 p->cluster_next = 1; 1900 1901 /* 1902 * Find out how many pages are allowed for a single swap 1903 * device. There are two limiting factors: 1) the number of 1904 * bits for the swap offset in the swp_entry_t type and 1905 * 2) the number of bits in the a swap pte as defined by 1906 * the different architectures. In order to find the 1907 * largest possible bit mask a swap entry with swap type 0 1908 * and swap offset ~0UL is created, encoded to a swap pte, 1909 * decoded to a swp_entry_t again and finally the swap 1910 * offset is extracted. This will mask all the bits from 1911 * the initial ~0UL mask that can't be encoded in either 1912 * the swp_entry_t or the architecture definition of a 1913 * swap pte. 1914 */ 1915 maxpages = swp_offset(pte_to_swp_entry( 1916 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; 1917 if (maxpages > swap_header->info.last_page) 1918 maxpages = swap_header->info.last_page; 1919 p->highest_bit = maxpages - 1; 1920 1921 error = -EINVAL; 1922 if (!maxpages) 1923 goto bad_swap; 1924 if (swapfilepages && maxpages > swapfilepages) { 1925 printk(KERN_WARNING 1926 "Swap area shorter than signature indicates\n"); 1927 goto bad_swap; 1928 } 1929 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1930 goto bad_swap; 1931 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1932 goto bad_swap; 1933 1934 /* OK, set up the swap map and apply the bad block list */ 1935 swap_map = vmalloc(maxpages * sizeof(short)); 1936 if (!swap_map) { 1937 error = -ENOMEM; 1938 goto bad_swap; 1939 } 1940 1941 memset(swap_map, 0, maxpages * sizeof(short)); 1942 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1943 int page_nr = swap_header->info.badpages[i]; 1944 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { 1945 error = -EINVAL; 1946 goto bad_swap; 1947 } 1948 swap_map[page_nr] = SWAP_MAP_BAD; 1949 } 1950 1951 error = swap_cgroup_swapon(type, maxpages); 1952 if (error) 1953 goto bad_swap; 1954 1955 nr_good_pages = swap_header->info.last_page - 1956 swap_header->info.nr_badpages - 1957 1 /* header page */; 1958 1959 if (nr_good_pages) { 1960 swap_map[0] = SWAP_MAP_BAD; 1961 p->max = maxpages; 1962 p->pages = nr_good_pages; 1963 nr_extents = setup_swap_extents(p, &span); 1964 if (nr_extents < 0) { 1965 error = nr_extents; 1966 goto bad_swap; 1967 } 1968 nr_good_pages = p->pages; 1969 } 1970 if (!nr_good_pages) { 1971 printk(KERN_WARNING "Empty swap-file\n"); 1972 error = -EINVAL; 1973 goto bad_swap; 1974 } 1975 1976 if (p->bdev) { 1977 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 1978 p->flags |= SWP_SOLIDSTATE; 1979 p->cluster_next = 1 + (random32() % p->highest_bit); 1980 } 1981 if (discard_swap(p) == 0) 1982 p->flags |= SWP_DISCARDABLE; 1983 } 1984 1985 mutex_lock(&swapon_mutex); 1986 spin_lock(&swap_lock); 1987 if (swap_flags & SWAP_FLAG_PREFER) 1988 p->prio = 1989 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 1990 else 1991 p->prio = --least_priority; 1992 p->swap_map = 
swap_map; 1993 p->flags |= SWP_WRITEOK; 1994 nr_swap_pages += nr_good_pages; 1995 total_swap_pages += nr_good_pages; 1996 1997 printk(KERN_INFO "Adding %uk swap on %s. " 1998 "Priority:%d extents:%d across:%lluk %s%s\n", 1999 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 2000 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2001 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2002 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2003 2004 /* insert swap space into swap_list: */ 2005 prev = -1; 2006 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 2007 if (p->prio >= swap_info[i].prio) { 2008 break; 2009 } 2010 prev = i; 2011 } 2012 p->next = i; 2013 if (prev < 0) { 2014 swap_list.head = swap_list.next = p - swap_info; 2015 } else { 2016 swap_info[prev].next = p - swap_info; 2017 } 2018 spin_unlock(&swap_lock); 2019 mutex_unlock(&swapon_mutex); 2020 error = 0; 2021 goto out; 2022 bad_swap: 2023 if (bdev) { 2024 set_blocksize(bdev, p->old_block_size); 2025 bd_release(bdev); 2026 } 2027 destroy_swap_extents(p); 2028 swap_cgroup_swapoff(type); 2029 bad_swap_2: 2030 spin_lock(&swap_lock); 2031 p->swap_file = NULL; 2032 p->flags = 0; 2033 spin_unlock(&swap_lock); 2034 vfree(swap_map); 2035 if (swap_file) 2036 filp_close(swap_file, NULL); 2037 out: 2038 if (page && !IS_ERR(page)) { 2039 kunmap(page); 2040 page_cache_release(page); 2041 } 2042 if (name) 2043 putname(name); 2044 if (did_down) { 2045 if (!error) 2046 inode->i_flags |= S_SWAPFILE; 2047 mutex_unlock(&inode->i_mutex); 2048 } 2049 return error; 2050 } 2051 2052 void si_swapinfo(struct sysinfo *val) 2053 { 2054 unsigned int i; 2055 unsigned long nr_to_be_unused = 0; 2056 2057 spin_lock(&swap_lock); 2058 for (i = 0; i < nr_swapfiles; i++) { 2059 if (!(swap_info[i].flags & SWP_USED) || 2060 (swap_info[i].flags & SWP_WRITEOK)) 2061 continue; 2062 nr_to_be_unused += swap_info[i].inuse_pages; 2063 } 2064 val->freeswap = nr_swap_pages + nr_to_be_unused; 2065 val->totalswap = total_swap_pages + nr_to_be_unused; 2066 spin_unlock(&swap_lock); 2067 } 2068 2069 /* 2070 * Verify that a swap entry is valid and increment its swap map count. 2071 * 2072 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 2073 * "permanent", but will be reclaimed by the next swapoff. 2074 * Returns error code in following case. 2075 * - success -> 0 2076 * - swp_entry is invalid -> EINVAL 2077 * - swp_entry is migration entry -> EINVAL 2078 * - swap-cache reference is requested but there is already one. -> EEXIST 2079 * - swap-cache reference is requested but the entry is not used. 
 *  -> ENOENT
 */
static int __swap_duplicate(swp_entry_t entry, bool cache)
{
	struct swap_info_struct *p;
	unsigned long offset, type;
	int result = -EINVAL;
	int count;
	bool has_cache;

	if (non_swap_entry(entry))
		return -EINVAL;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = type + swap_info;
	offset = swp_offset(entry);

	spin_lock(&swap_lock);

	if (unlikely(offset >= p->max))
		goto unlock_out;

	count = swap_count(p->swap_map[offset]);
	has_cache = swap_has_cache(p->swap_map[offset]);

	if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count) {
			p->swap_map[offset] = encode_swapmap(count, true);
			result = 0;
		} else if (has_cache) /* someone added cache */
			result = -EEXIST;
		else if (!count) /* no users */
			result = -ENOENT;

	} else if (count || has_cache) {
		if (count < SWAP_MAP_MAX - 1) {
			p->swap_map[offset] = encode_swapmap(count + 1,
							     has_cache);
			result = 0;
		} else if (count <= SWAP_MAP_MAX) {
			if (swap_overflow++ < 5)
				printk(KERN_WARNING
				       "swap_dup: swap entry overflow\n");
			p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
							     has_cache);
			result = 0;
		}
	} else
		result = -ENOENT; /* unused swap entry */
unlock_out:
	spin_unlock(&swap_lock);
out:
	return result;

bad_file:
	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}
/*
 * Increase reference count of swap entry by 1.
 */
void swap_duplicate(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP);
}

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * This can return error codes.  Returns 0 on success.
 * -EEXIST means there is already a swap cache for the entry.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_CACHE);
}


struct swap_info_struct *
get_swap_info_struct(unsigned type)
{
	return &swap_info[type];
}
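/*
 * Illustrative sketch (not in the original source): how a caller is expected
 * to interpret swapcache_prepare()'s return values, following the
 * __swap_duplicate() cases above.  The helper name and the retry/abort policy
 * are assumptions for illustration; the real caller lives in mm/swap_state.c.
 * Compiled out.
 */
#if 0
static int swapcache_prepare_usage_example(swp_entry_t entry)
{
	int err = swapcache_prepare(entry);

	if (!err)		/* SWAP_HAS_CACHE is now set: go add the page */
		return 0;
	if (err == -EEXIST)	/* someone else owns the cache slot: retry later */
		return -EAGAIN;
	/* -ENOENT or -EINVAL: the entry is no longer in use, give up */
	return err;
}
#endif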
/*
 * swap_lock prevents swap_map being freed. Don't grab an extra
 * reference on the swaphandle, it doesn't matter if it becomes unused.
 */
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
	struct swap_info_struct *si;
	int our_page_cluster = page_cluster;
	pgoff_t target, toff;
	pgoff_t base, end;
	int nr_pages = 0;

	if (!our_page_cluster)	/* no readahead */
		return 0;

	si = &swap_info[swp_type(entry)];
	target = swp_offset(entry);
	base = (target >> our_page_cluster) << our_page_cluster;
	end = base + (1 << our_page_cluster);
	if (!base)		/* first page is swap header */
		base++;

	spin_lock(&swap_lock);
	if (end > si->max)	/* don't go beyond end of map */
		end = si->max;

	/* Count contiguous allocated slots above our target */
	for (toff = target; ++toff < end; nr_pages++) {
		/* Don't read in free or bad pages */
		if (!si->swap_map[toff])
			break;
		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
			break;
	}
	/* Count contiguous allocated slots below our target */
	for (toff = target; --toff >= base; nr_pages++) {
		/* Don't read in free or bad pages */
		if (!si->swap_map[toff])
			break;
		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
			break;
	}
	spin_unlock(&swap_lock);

	/*
	 * Indicate starting offset, and return number of pages to get:
	 * if only 1, say 0, since there's then no readahead to be done.
	 */
	*offset = ++toff;
	return nr_pages ? ++nr_pages : 0;
}