/*
 * linux/mm/swap.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;
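/*
 * Editor's note (illustrative, not part of the original file): page_cluster
 * is the log2 of the swap readahead window and is tunable through the
 * vm.page-cluster sysctl.  A sketch of how a swap-in path would use it:
 *
 *	unsigned long readahead_pages = 1UL << page_cluster;	// 3 -> 8 pages
 *
 * swap_setup() at the bottom of this file picks 2 or 3 based on memory size.
 */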
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(&zone->lru_lock, flags);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, 0);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}
static void put_compound_page(struct page *page)
{
	struct page *page_head;

	if (likely(!PageTail(page))) {
		if (put_page_testzero(page)) {
			/*
			 * By the time all refcounts have been released
			 * split_huge_page cannot run anymore from under us.
			 */
			if (PageHead(page))
				__put_compound_page(page);
			else
				__put_single_page(page);
		}
		return;
	}

	/* __split_huge_page_refcount can run under us */
	page_head = compound_head(page);

	/*
	 * THP cannot break up slab pages, so avoid taking
	 * compound_lock() and skip the tail page refcounting (in
	 * _mapcount) too. Slab performs non-atomic bit ops on
	 * page->flags for better performance. In particular
	 * slab_unlock() in slub used to be a hot path. It is still
	 * hot on arches that do not support
	 * this_cpu_cmpxchg_double().
	 *
	 * If "page" is part of a slab or hugetlbfs page it cannot be
	 * split and the head page cannot change from under us. And
	 * if "page" is part of a THP page under splitting, then if the
	 * head page pointed to by the THP tail isn't a THP head anymore,
	 * we'll find PageTail clear after smp_rmb() and we'll treat
	 * it as a single page.
	 */
	if (!__compound_tail_refcounted(page_head)) {
		/*
		 * If "page" is a THP tail, we must read the tail page
		 * flags after the head page flags. The
		 * split_huge_page side enforces write memory barriers
		 * between clearing PageTail and before the head page
		 * can be freed and reallocated.
		 */
		smp_rmb();
		if (likely(PageTail(page))) {
			/*
			 * __split_huge_page_refcount cannot race
			 * here.
			 */
			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
			VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
			if (put_page_testzero(page_head)) {
				/*
				 * If this is the tail of a slab
				 * compound page, the tail pin must
				 * not be the last reference held on
				 * the page, because the PG_slab
				 * cannot be cleared before all tail
				 * pins (which skips the _mapcount
				 * tail refcounting) have been
				 * released. For hugetlbfs the tail
				 * pin may be the last reference on
				 * the page instead, because
				 * PageHeadHuge will not go away until
				 * the compound page enters the buddy
				 * allocator.
				 */
				VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
				__put_compound_page(page_head);
			}
			return;
		} else
			/*
			 * __split_huge_page_refcount ran before us,
			 * "page" was a THP tail. The split page_head
			 * has been freed and reallocated as slab or
			 * hugetlbfs page of smaller order (only
			 * possible if reallocated as slab on x86).
			 */
			goto out_put_single;
	}
	if (likely(page != page_head && get_page_unless_zero(page_head))) {
		unsigned long flags;

		/*
		 * page_head wasn't a dangling pointer but it may not
		 * be a head page anymore by the time we obtain the
		 * lock. That is ok as long as it can't be freed from
		 * under us.
		 */
		flags = compound_lock_irqsave(page_head);
		if (unlikely(!PageTail(page))) {
			/* __split_huge_page_refcount ran before us */
			compound_unlock_irqrestore(page_head, flags);
			if (put_page_testzero(page_head)) {
				/*
				 * The head page may have been freed
				 * and reallocated as a compound page
				 * of smaller order and then freed
				 * again.  All we know is that it
				 * cannot have become: a THP page, a
				 * compound page of higher order, a
				 * tail page.  That is because we
				 * still hold the refcount of the
				 * split THP tail and page_head was
				 * the THP head before the split.
				 */
				if (PageHead(page_head))
					__put_compound_page(page_head);
				else
					__put_single_page(page_head);
			}
out_put_single:
			if (put_page_testzero(page))
				__put_single_page(page);
			return;
		}
		VM_BUG_ON_PAGE(page_head != page->first_page, page);
		/*
		 * We can release the refcount taken by
		 * get_page_unless_zero() now that
		 * __split_huge_page_refcount() is blocked on the
		 * compound_lock.
		 */
		if (put_page_testzero(page_head))
			VM_BUG_ON_PAGE(1, page_head);
		/* __split_huge_page_refcount will wait now */
		VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
		atomic_dec(&page->_mapcount);
		VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
		VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
		compound_unlock_irqrestore(page_head, flags);

		if (put_page_testzero(page_head)) {
			if (PageHead(page_head))
				__put_compound_page(page_head);
			else
				__put_single_page(page_head);
		}
	} else {
		/* page_head is a dangling pointer */
		VM_BUG_ON_PAGE(PageTail(page), page);
		goto out_put_single;
	}
}
void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__put_single_page(page);
}
EXPORT_SYMBOL(put_page);
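/*
 * Illustrative usage sketch (editor's addition, not a caller in this file):
 * a typical user takes a reference before dropping whatever lock protects
 * the page, and releases it when done; put_page() frees the page if this
 * was the last reference, handling compound pages via the slow path above.
 *
 *	get_page(page);			// pin: page can't be freed
 *	...use the page, possibly sleeping...
 *	put_page(page);			// unpin: may free the page
 */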
/*
 * This function is exported but must not be called by anything other
 * than get_page(). It implements the slow path of get_page().
 */
bool __get_page_tail(struct page *page)
{
	/*
	 * This takes care of get_page() if run on a tail page
	 * returned by one of the get_user_pages/follow_page variants.
	 * get_user_pages/follow_page itself doesn't need the compound
	 * lock because it runs __get_page_tail_foll() under the
	 * proper PT lock that already serializes against
	 * split_huge_page().
	 */
	unsigned long flags;
	bool got;
	struct page *page_head = compound_head(page);

	/* See the comment in put_compound_page(). */
	if (!__compound_tail_refcounted(page_head)) {
		smp_rmb();
		if (likely(PageTail(page))) {
			/*
			 * This is a hugetlbfs page or a slab
			 * page. __split_huge_page_refcount
			 * cannot race here.
			 */
			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
			__get_page_tail_foll(page, true);
			return true;
		} else {
			/*
			 * __split_huge_page_refcount ran
			 * before us, "page" was a THP
			 * tail. The split page_head has been
			 * freed and reallocated as slab or
			 * hugetlbfs page of smaller order
			 * (only possible if reallocated as
			 * slab on x86).
			 */
			return false;
		}
	}

	got = false;
	if (likely(page != page_head && get_page_unless_zero(page_head))) {
		/*
		 * page_head wasn't a dangling pointer but it
		 * may not be a head page anymore by the time
		 * we obtain the lock. That is ok as long as it
		 * can't be freed from under us.
		 */
		flags = compound_lock_irqsave(page_head);
		/* here __split_huge_page_refcount won't run anymore */
		if (likely(PageTail(page))) {
			__get_page_tail_foll(page, false);
			got = true;
		}
		compound_unlock_irqrestore(page_head, flags);
		if (unlikely(!got))
			put_page(page_head);
	}
	return got;
}
EXPORT_SYMBOL(__get_page_tail);
/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
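/*
 * Illustrative sketch (editor's addition): a caller such as a readpages()
 * error path hands back the pages it did not consume.  "page_list" is a
 * hypothetical list threaded through page->lru:
 *
 *	LIST_HEAD(page_list);
 *	...add pages with list_add(&page->lru, &page_list)...
 *	put_pages_list(&page_list);	// drops one reference per page
 */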
/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_segs is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		page_cache_get(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointer to the page pinned.
 *		Must be at least nr_segs long.
 *
 * Returns 1 if page is pinned. If the page was not pinned, returns
 * -errno. The page returned must be released with a put_page() call
 * when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);
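/*
 * Illustrative sketch (editor's addition): pinning one kernel page, e.g.
 * for code that wants a struct page for a page-aligned kernel buffer.
 * "buf" is hypothetical and must be page-aligned:
 *
 *	struct page *page;
 *
 *	if (get_kernel_page((unsigned long)buf, 0, &page) == 1) {
 *		...use page...
 *		put_page(page);
 *	}
 */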
static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = pagezone;
			spin_lock_irqsave(&zone->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);
		(*move_fn)(page, lruvec, arg);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		enum lru_list lru = page_lru_base_type(page);
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		(*pgmoved)++;
	}
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		page_cache_get(page);
		local_irq_save(flags);
		pvec = &__get_cpu_var(lru_rotate_pvecs);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}
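/*
 * Editor's note (illustrative): the expected caller is end_page_writeback(),
 * which, roughly, does:
 *
 *	if (TestClearPageReclaim(page))
 *		rotate_reclaimable_page(page);
 *
 * so a page tagged PG_reclaim by reclaim or by lru_deactivate_fn() below is
 * moved to the tail of the inactive list as soon as its writeback completes.
 */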
static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page, page_to_pfn(page));

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		page_cache_get(page);
		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
	return false;
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
	spin_unlock_irq(&zone->lru_lock);
}
#endif
static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec. Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained. Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs. Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}
EXPORT_SYMBOL(mark_page_accessed);
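/*
 * Illustrative sketch (editor's addition): two touches promote a page, per
 * the state table above.  For a page starting out inactive,unreferenced:
 *
 *	mark_page_accessed(page);	// -> inactive,referenced
 *	mark_page_accessed(page);	// -> active,unreferenced (activated)
 *
 * File I/O paths call this from do_generic_file_read()-style code on each
 * access to a page cache page.
 */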
/*
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives the caller of __lru_cache_add() a chance
 * to have the page added to the active list using mark_page_accessed().
 */
void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	page_cache_get(page);
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);
	pagevec_add(pvec, page);
	put_cpu_var(lru_add_pvec);
}
EXPORT_SYMBOL(__lru_cache_add);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through eg. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);
	struct lruvec *lruvec;

	spin_lock_irq(&zone->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, zone);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}
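/*
 * Illustrative sketch (editor's addition): a fault path adds a newly
 * allocated, not-yet-visible page to the LRU.  Hypothetical outline:
 *
 *	__SetPageUptodate(page);
 *	...map the page into the page tables...
 *	lru_cache_add(page);		// queued via the per-cpu pagevec
 *
 * whereas add_page_to_unevictable_list() takes zone->lru_lock and places
 * the page straight on the unevictable list, bypassing the pagevec.
 */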
/*
 * If the page cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page isn't page_mapped and dirty/writeback, the page
 * can be reclaimed ASAP using PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page is moved to the head of the inactive list because
 * the VM expects it to be written out by flusher threads, which is much
 * more effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * PG_reclaim can race with end_page_writeback, which
		 * may confuse readahead.  But the race window is
		 * _really_ small and it's a non-critical problem.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it was on the pagevec;
		 * move the page to the tail of the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

	activate_page_drain(cpu);
}
/**
 * deactivate_page - forcefully deactivate a page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages (such as mprotect),
	 * deactivating unevictable pages to accelerate reclaim is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);

		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
		put_cpu_var(lru_deactivate_pvecs);
	}
}
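/*
 * Illustrative sketch (editor's addition): invalidate_mapping_pages()-style
 * callers use deactivate_page() when a page cannot be invalidated:
 *
 *	if (!invalidate_inode_page(page))
 *		deactivate_page(page);	// dirty/writeback: reclaim it soon
 */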
void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

void lru_add_drain_all(void)
{
	static DEFINE_MUTEX(lock);
	static struct cpumask has_work;
	int cpu;

	mutex_lock(&lock);
	get_online_cpus();
	cpumask_clear(&has_work);

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			schedule_work_on(cpu, work);
			cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	put_online_cpus();
	mutex_unlock(&lock);
}
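/*
 * Editor's note (illustrative): callers that are about to scan the LRU
 * lists and need the per-cpu pagevecs flushed first (e.g. mlock() or page
 * migration preparation) do:
 *
 *	lru_add_drain_all();	// may sleep: schedules work on every cpu
 *
 * while lru_add_drain() only drains the current cpu and is cheap enough
 * for hot paths such as __pagevec_release() below.
 */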
/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, zone);
			VM_BUG_ON_PAGE(!PageLRU(page), page);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		ClearPageActive(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * Head page has not yet been counted, as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);
/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec:	Where the resulting entries are placed
 * @mapping:	The address_space to search
 * @start:	The starting entry index
 * @nr_entries:	The maximum number of entries
 * @indices:	The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_entries pages and shadow entries in the mapping.  All
 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes.  There may be holes in the indices due to
 * not-present entries.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
				struct address_space *mapping,
				pgoff_t start, unsigned nr_entries,
				pgoff_t *indices)
{
	pvec->nr = find_get_entries(mapping, start, nr_entries,
				    pvec->pages, indices);
	return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec:	The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec.  This function prunes all
 * exceptionals from @pvec without leaving holes, so that it can be
 * passed on to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!radix_tree_exceptional_entry(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}
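/*
 * Illustrative sketch (editor's addition): truncate-style loops combine the
 * two helpers above, pruning shadow entries before page-only processing.
 * "mapping" and "index" belong to the hypothetical caller:
 *
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup_entries(&pvec, mapping, index,
 *				      PAGEVEC_SIZE, indices)) {
 *		index = indices[pagevec_count(&pvec) - 1] + 1;
 *		...handle shadow entries via indices[]...
 *		pagevec_remove_exceptionals(&pvec);
 *		...operate on the remaining real pages...
 *		pagevec_release(&pvec);
 *	}
 */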
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
	int i;

	if (bdi_init(swapper_spaces[0].backing_dev_info))
		panic("Failed to init swap bdi");
	for (i = 0; i < MAX_SWAPFILES; i++) {
		spin_lock_init(&swapper_spaces[i].tree_lock);
		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
	}
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}