// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>

/*
 * struct page extension
 *
 * This feature manages memory for extended data associated with each page.
 *
 * Without it, storing extra per-page data means modifying struct page
 * itself, which requires rebuilding the kernel, a really time-consuming
 * process that is sometimes impossible due to third-party module
 * dependencies. Worse, enlarging struct page can cause unwanted changes
 * in system behaviour.
 *
 * This feature is intended to overcome those problems. It allocates memory
 * for the extended per-page data somewhere other than struct page itself,
 * and that memory is accessed through the accessor functions provided by
 * this code. During boot, the core checks whether the (potentially huge)
 * allocation is needed at all, and avoids it entirely if not. Thanks to
 * this, the feature can be included in the kernel by default, avoiding
 * rebuilds and solving the problems above.
 *
 * To make this work, there are two callbacks for clients. The need
 * callback is mandatory if a client wants to avoid useless memory
 * allocation at boot time. The init callback is optional and is used to
 * do proper initialization after the memory has been allocated.
 *
 * The need callback decides whether the extended memory allocation is
 * needed. Some features may be deactivated for a given boot, making the
 * extra memory unnecessary; to avoid allocating a huge chunk in that case,
 * each client expresses its need for extra memory through the need
 * callback. If any need callback returns true, someone needs the extra
 * memory and the page extension core allocates it. If none return true,
 * no memory is needed for this boot and the core skips the allocation
 * entirely, so no memory is wasted.
 *
 * When a need callback returns true, page_ext checks whether the client
 * requests extra memory through the size field of struct
 * page_ext_operations. If it is non-zero, extra space is allocated for
 * each page_ext entry and its offset is returned to the client through
 * the offset field of struct page_ext_operations.
 *
 * The init callback is used to do proper initialization once page
 * extension is completely set up. On sparse memory systems, the extra
 * memory is allocated some time after the memmap, i.e. the lifetime of
 * page extension memory differs from that of the memmap for struct page.
 * Clients therefore cannot store extra data until page extension is
 * initialized, even though pages may already be allocated and freely
 * used. That could leave the per-page extra data in an inconsistent
 * state; to prevent this, a client can use the init callback to set it
 * up correctly.
 */

#ifdef CONFIG_SPARSEMEM
#define PAGE_EXT_INVALID	(0x1)
#endif

#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
static bool need_page_idle(void)
{
	return true;
}
static struct page_ext_operations page_idle_ops __initdata = {
	.need = need_page_idle,
};
#endif

static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
	&page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
	&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
	&page_table_check_ops,
#endif
};

unsigned long page_ext_size = sizeof(struct page_ext);

static unsigned long total_usage;
static struct page_ext *lookup_page_ext(const struct page *page);

bool early_page_ext;
static int __init setup_early_page_ext(char *str)
{
	early_page_ext = true;
	return 0;
}
early_param("early_page_ext", setup_early_page_ext);

static bool __init invoke_need_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);
	bool need = false;

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
			page_ext_ops[i]->offset = page_ext_size;
			page_ext_size += page_ext_ops[i]->size;
			need = true;
		}
	}

	return need;
}
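
/*
 * For illustration, a minimal hypothetical client could look like the
 * sketch below (my_ext, my_need, my_init and my_ext_ops are made-up
 * names, not part of this file). Its entry would be compiled into the
 * page_ext_ops[] array above; .size reserves room after each struct
 * page_ext, and invoke_need_callbacks() fills in .offset:
 *
 *	struct my_ext {
 *		unsigned long data;
 *	};
 *
 *	static bool my_need(void)
 *	{
 *		return true;	// e.g. gated on a boot parameter
 *	}
 *
 *	static void my_init(void)
 *	{
 *		// page_ext is fully set up; initialize client state here
 *	}
 *
 *	struct page_ext_operations my_ext_ops = {
 *		.size = sizeof(struct my_ext),
 *		.need = my_need,
 *		.init = my_init,
 *	};
 *
 * A client then finds its data at (void *)page_ext + my_ext_ops.offset,
 * the same scheme page_owner uses with its own ops structure.
 */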

static void __init invoke_init_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->init)
			page_ext_ops[i]->init();
	}
}

#ifndef CONFIG_SPARSEMEM
void __init page_ext_init_flatmem_late(void)
{
	invoke_init_callbacks();
}
#endif

static inline struct page_ext *get_entry(void *base, unsigned long index)
{
	return base + page_ext_size * index;
}

/**
 * page_ext_get() - Get the extended information for a page.
 * @page: The page we're interested in.
 *
 * Ensures that the page_ext will remain valid until page_ext_put()
 * is called.
 *
 * Return: NULL if no page_ext exists for this page.
 * Context: Any context.  Caller may not sleep until they have called
 * page_ext_put().
 */
struct page_ext *page_ext_get(struct page *page)
{
	struct page_ext *page_ext;

	rcu_read_lock();
	page_ext = lookup_page_ext(page);
	if (!page_ext) {
		rcu_read_unlock();
		return NULL;
	}

	return page_ext;
}

/**
 * page_ext_put() - Working with page extended information is done.
 * @page_ext: Page extended information received from page_ext_get().
 *
 * The page extended information of the page may not be valid after this
 * function is called.
 *
 * Return: None.
 * Context: Any context where the matching page_ext_get() was called.
 */
void page_ext_put(struct page_ext *page_ext)
{
	if (unlikely(!page_ext))
		return;

	rcu_read_unlock();
}
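
/*
 * Typical usage, as a minimal sketch (my_ext_ops is the hypothetical
 * client from the example near the top of this file; real clients such
 * as page_owner follow the same pattern):
 *
 *	struct page_ext *page_ext = page_ext_get(page);
 *
 *	if (page_ext) {
 *		void *data = (void *)page_ext + my_ext_ops.offset;
 *		// ... read or update the client data; do not sleep ...
 *		page_ext_put(page_ext);
 *	}
 *
 * page_ext_get() enters an RCU read-side critical section, which is why
 * the caller must not sleep before the matching page_ext_put().
 */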

#ifndef CONFIG_SPARSEMEM

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
	pgdat->node_page_ext = NULL;
}

static struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long index;
	struct page_ext *base;

	WARN_ON_ONCE(!rcu_read_lock_held());
	base = NODE_DATA(page_to_nid(page))->node_page_ext;
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (unlikely(!base))
		return NULL;
	index = pfn - round_down(node_start_pfn(page_to_nid(page)),
					MAX_ORDER_NR_PAGES);
	return get_entry(base, index);
}
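
/*
 * A worked example of the index math above, with illustrative numbers
 * (assuming MAX_ORDER_NR_PAGES == 1024): for a node starting at pfn 1536,
 * round_down() yields 1024, so the page at pfn 2000 maps to index
 * 2000 - 1024 = 976. Sizing the table from this rounded-down base is why
 * alloc_node_page_ext() below adds MAX_ORDER_NR_PAGES of slack for
 * unaligned node ranges.
 */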

static int __init alloc_node_page_ext(int nid)
{
	struct page_ext *base;
	unsigned long table_size;
	unsigned long nr_pages;

	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	if (!nr_pages)
		return 0;

	/*
	 * Extra space is needed if the node range is not aligned to
	 * MAX_ORDER_NR_PAGES: when the page allocator's buddy algorithm
	 * checks a buddy's status, the range examined can fall outside
	 * the exact node range.
	 */
	if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
		nr_pages += MAX_ORDER_NR_PAGES;

	table_size = page_ext_size * nr_pages;

	base = memblock_alloc_try_nid(
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_ext = base;
	total_usage += table_size;
	return 0;
}

void __init page_ext_init_flatmem(void)
{
	int nid, fail;

	if (!invoke_need_callbacks())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_ext(nid);
		if (fail)
			goto fail;
	}
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	return;

fail:
	pr_crit("allocation of page_ext failed.\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

static bool page_ext_invalid(struct page_ext *page_ext)
{
	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
}
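
/*
 * PAGE_EXT_INVALID is a tag kept in bit 0 of the section's page_ext
 * pointer; this relies on the stored value having its low bit clear in
 * normal operation (page_ext_size is a sum of struct sizes and stays
 * even in practice). A minimal sketch of the round trip, matching what
 * __invalidate_page_ext() and __free_page_ext() below actually do:
 *
 *	void *tagged = (void *)ms->page_ext + PAGE_EXT_INVALID;	// mark
 *	void *base = (void *)tagged - PAGE_EXT_INVALID;		// unmark
 *
 * A reader that observes a tagged (or NULL) pointer gets NULL from
 * lookup_page_ext() via page_ext_invalid() above.
 */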

static struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
	struct page_ext *page_ext = READ_ONCE(section->page_ext);

	WARN_ON_ONCE(!rcu_read_lock_held());
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (page_ext_invalid(page_ext))
		return NULL;
	return get_entry(page_ext, pfn);
}

static void *__meminit alloc_page_ext(size_t size, int nid)
{
	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
	void *addr = NULL;

	addr = alloc_pages_exact_nid(nid, size, flags);
	if (addr) {
		kmemleak_alloc(addr, size, 1, flags);
		return addr;
	}

	addr = vzalloc_node(size, nid);

	return addr;
}

static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
	struct mem_section *section;
	struct page_ext *base;
	unsigned long table_size;

	section = __pfn_to_section(pfn);

	if (section->page_ext)
		return 0;

	table_size = page_ext_size * PAGES_PER_SECTION;
	base = alloc_page_ext(table_size, nid);

	/*
	 * The value stored in section->page_ext is (base - pfn)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		pr_err("page ext allocation failure\n");
		return -ENOMEM;
	}

	/*
	 * The passed "pfn" may not be aligned to SECTION.  For the calculation
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
	section->page_ext = (void *)base - page_ext_size * pfn;
	total_usage += table_size;
	return 0;
}
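
/*
 * Why store (base - page_ext_size * pfn)? It lets lookup_page_ext()
 * index with the absolute pfn instead of a section-relative one. A
 * worked example with illustrative numbers: if a section covers pfns
 * [0x10000, 0x10000 + PAGES_PER_SECTION) and its table lives at "base",
 * then section->page_ext = base - page_ext_size * 0x10000, and
 * get_entry(section->page_ext, 0x10003) yields base + page_ext_size * 3,
 * the entry for the fourth page of the section.
 */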

static void free_page_ext(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size;

		table_size = page_ext_size * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		kmemleak_free(addr);
		free_pages_exact(addr, table_size);
	}
}

static void __free_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_ext *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;

	base = READ_ONCE(ms->page_ext);
	/*
	 * page_ext here can be valid while doing the roll back
	 * operation in online_page_ext().
	 */
	if (page_ext_invalid(base))
		base = (void *)base - PAGE_EXT_INVALID;
	WRITE_ONCE(ms->page_ext, NULL);

	base = get_entry(base, pfn);
	free_page_ext(base);
}

static void __invalidate_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	void *val;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;
	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
	WRITE_ONCE(ms->page_ext, val);
}
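
/*
 * How invalidation interlocks with readers, a minimal sketch (the
 * timeline is illustrative): offline_page_ext() below tags every
 * affected section with PAGE_EXT_INVALID and then calls
 * synchronize_rcu(). A reader that entered page_ext_get() before the
 * tag was written may still hold a pointer into the table, but it is
 * inside rcu_read_lock(), so synchronize_rcu() waits for its
 * page_ext_put(). Readers arriving after the tag see
 * page_ext_invalid() and get NULL. Only then is the table actually
 * freed, so no reader can touch freed memory.
 */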

static int __meminit online_page_ext(unsigned long start_pfn,
				unsigned long nr_pages,
				int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	if (nid == NUMA_NO_NODE) {
		/*
		 * In this case, "nid" already exists and contains valid memory.
		 * "start_pfn" passed to us is a pfn which is an arg for
		 * online_pages(), and start_pfn should exist.
		 */
		nid = pfn_to_nid(start_pfn);
		VM_BUG_ON(!node_online(nid));
	}

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
		fail = init_section_page_ext(pfn, nid);
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);

	return -ENOMEM;
}

static int __meminit offline_page_ext(unsigned long start_pfn,
				unsigned long nr_pages)
{
	unsigned long start, end, pfn;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	/*
	 * Freeing of page_ext is done in 3 steps to avoid
	 * use-after-free of it:
	 * 1) Traverse all the sections and mark their page_ext
	 *    as invalid.
	 * 2) Wait for all the existing users of page_ext who
	 *    started before invalidation to finish.
	 * 3) Free the page_ext.
	 */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__invalidate_page_ext(pfn);

	synchronize_rcu();

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);
	return 0;
}

static int __meminit page_ext_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_ext(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages);
		break;
	case MEM_CANCEL_ONLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages);
		break;
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

void __init page_ext_init(void)
{
	unsigned long pfn;
	int nid;

	if (!invoke_need_callbacks())
		return;

	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		start_pfn = node_start_pfn(nid);
		end_pfn = node_end_pfn(nid);
		/*
		 * start_pfn and end_pfn may not be aligned to SECTION and the
		 * page->flags of out-of-node pages are not initialized.  So
		 * we scan [start_pfn, the biggest section's pfn < end_pfn)
		 * here.
		 */
		for (pfn = start_pfn; pfn < end_pfn;
			pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

			if (!pfn_valid(pfn))
				continue;
			/*
			 * Nodes' pfn ranges can overlap.
			 * We know some architectures can have a node layout
			 * such as
			 * -------------pfn-------------->
			 * N0 | N1 | N2 | N0 | N1 | N2|....
			 */
			if (pfn_to_nid(pfn) != nid)
				continue;
			if (init_section_page_ext(pfn, nid))
				goto oom;
			cond_resched();
		}
	}
	hotplug_memory_notifier(page_ext_callback, 0);
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	invoke_init_callbacks();
	return;

oom:
	panic("Out of memory");
}

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif