/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}

static void put_compound_page(struct page *page)
{
	page = compound_head(page);
	if (put_page_testzero(page)) {
		compound_page_dtor *dtor;

		dtor = get_compound_page_dtor(page);
		(*dtor)(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/**
 * put_pages_list(): release a list of pages
 *
 * Release a list of pages which are strung together on page->lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 *
 * @pages: list of pages threaded on page->lru
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
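
/*
 * Illustrative sketch (not part of the original file): a typical caller of
 * put_pages_list() collects pages on a private list threaded through
 * page->lru and hands back whatever it did not consume, much like the error
 * path of read_cache_pages().  The nr_to_alloc name below is a placeholder.
 *
 *	LIST_HEAD(page_pool);
 *	struct page *page;
 *	int i;
 *
 *	for (i = 0; i < nr_to_alloc; i++) {
 *		page = alloc_page(GFP_KERNEL);
 *		if (!page)
 *			break;
 *		list_add(&page->lru, &page_pool);
 *	}
 *	... consume some pages, removing each from the list ...
 *	put_pages_list(&page_pool);	(drops one reference on each leftover page)
 */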
/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int i;
	int pgmoved = 0;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock(&zone->lru_lock);
			zone = pagezone;
			spin_lock(&zone->lru_lock);
		}
		if (PageLRU(page) && !PageActive(page)) {
			list_move_tail(&page->lru, &zone->inactive_list);
			pgmoved++;
		}
	}
	if (zone)
		spin_unlock(&zone->lru_lock);
	__count_vm_events(PGROTATED, pgmoved);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 *
 * Returns zero if it cleared PG_writeback.
 */
int rotate_reclaimable_page(struct page *page)
{
	struct pagevec *pvec;
	unsigned long flags;

	if (PageLocked(page))
		return 1;
	if (PageDirty(page))
		return 1;
	if (PageActive(page))
		return 1;
	if (!PageLRU(page))
		return 1;

	page_cache_get(page);
	local_irq_save(flags);
	pvec = &__get_cpu_var(lru_rotate_pvecs);
	if (!pagevec_add(pvec, page))
		pagevec_move_tail(pvec);
	local_irq_restore(flags);

	if (!test_clear_page_writeback(page))
		BUG();

	return 0;
}

/*
 * FIXME: speed this up?
 */
void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page)) {
		del_page_from_inactive_list(zone, page);
		SetPageActive(page);
		add_page_to_active_list(zone, page);
		__count_vm_event(PGACTIVATE);
		mem_cgroup_move_lists(page_get_page_cgroup(page), true);
	}
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}

EXPORT_SYMBOL(mark_page_accessed);

/**
 * lru_cache_add: add a page to the page lists
 * @page: the page to add
 */
void lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}

void lru_cache_add_active(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add_active(pvec);
	put_cpu_var(lru_add_active_pvecs);
}
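
/*
 * Illustrative sketch (not part of the original file): the common pattern
 * for getting a new page-cache page onto the LRU mirrors what
 * add_to_page_cache_lru() in mm/filemap.c does - insert the page into the
 * page cache first, then hand it to lru_cache_add().  The mapping, offset
 * and gfp_mask names below are placeholders.
 *
 *	int err = add_to_page_cache(page, mapping, offset, gfp_mask);
 *	if (err == 0)
 *		lru_cache_add(page);
 *
 * lru_cache_add() takes its own reference on the page, so the caller keeps
 * (and must eventually drop) the reference it already holds.
 */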
/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
static void drain_cpu_pagevecs(int cpu)
{
	struct pagevec *pvec;

	pvec = &per_cpu(lru_add_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_add_active_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add_active(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

void lru_add_drain(void)
{
	drain_cpu_pagevecs(get_cpu());
	put_cpu();
}

#ifdef CONFIG_NUMA
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

#else

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	lru_add_drain();
	return 0;
}
#endif

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_cache(): we recheck the
 * page count inside the lock to see whether shrink_cache grabbed the page
 * via the LRU.  If it did, give up: shrink_cache will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;
	unsigned long uninitialized_var(flags);

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);
			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	pagevec_free(&pages_to_free);
}
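
/*
 * Illustrative sketch (not part of the original file): release_pages() is
 * the batched counterpart of page_cache_release().  A caller that holds an
 * array of referenced pages - for example the result of a gang lookup with
 * find_get_pages() - can drop all of those references in one pass instead
 * of releasing them one at a time.  The mapping and index names below are
 * placeholders.
 *
 *	struct page *pages[PAGEVEC_SIZE];
 *	unsigned nr;
 *
 *	nr = find_get_pages(mapping, index, PAGEVEC_SIZE, pages);
 *	... inspect the pages ...
 *	release_pages(pages, nr, 0);
 */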
/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * pagevec_release() for pages which are known to not be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	int i;
	struct pagevec pages_to_free;

	pagevec_init(&pages_to_free, pvec->cold);
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		VM_BUG_ON(PageLRU(page));
		if (put_page_testzero(page))
			pagevec_add(&pages_to_free, page);
	}
	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		add_page_to_inactive_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_lru_add);

void __pagevec_lru_add_active(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		add_page_to_active_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && !TestSetPageLocked(page)) {
			if (PagePrivate(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);
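
/*
 * Illustrative sketch (not part of the original file): the usual way to walk
 * a mapping with pagevec_lookup() is to advance the start index past the
 * last page returned and let pagevec_release() drop the references each
 * batch took.  The mapping and next names below are placeholders.
 *
 *	struct pagevec pvec;
 *	pgoff_t next = 0;
 *	int i;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			next = page->index + 1;
 *			... examine the page ...
 *		}
 *		pagevec_release(&pvec);
 *	}
 */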
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space) = 0;

void vm_acct_memory(long pages)
{
	long *local;

	preempt_disable();
	local = &__get_cpu_var(committed_space);
	*local += pages;
	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
		atomic_add(*local, &vm_committed_space);
		*local = 0;
	}
	preempt_enable();
}

#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long *committed;

	committed = &per_cpu(committed_space, (long)hcpu);
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		atomic_add(*committed, &vm_committed_space);
		*committed = 0;
		drain_cpu_pagevecs((long)hcpu);
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

#ifdef CONFIG_SWAP
	bdi_init(swapper_space.backing_dev_info);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system means that we
	 * _really_ don't want to cluster much more
	 */
#ifdef CONFIG_HOTPLUG_CPU
	hotcpu_notifier(cpu_swap_callback, 0);
#endif
}
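
/*
 * Illustrative worked example (not part of the original file): page_cluster
 * is used as a power-of-two exponent by the swap I/O clustering code, so the
 * values chosen above mean clusters of 1 << 2 = 4 pages on machines with
 * less than 16 MB of memory and 1 << 3 = 8 pages otherwise.  With 4 KB
 * pages (PAGE_SHIFT == 12), megs = num_physpages >> 8; a machine with
 * 32768 physical pages (128 MB) therefore gets megs = 128 and
 * page_cluster = 3.
 */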