// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>

/*
 * struct page extension
 *
 * This is the feature to manage memory for extended data per page.
 *
 * Until now, storing extra data per page meant modifying struct page itself,
 * which requires rebuilding the kernel - a time consuming process, and
 * sometimes an impossible one due to third party module dependencies.
 * Worse, enlarging struct page can cause unwanted changes in system
 * behaviour.
 *
 * This feature is intended to overcome the above problems. It allocates
 * memory for extended data per page in a separate place rather than in
 * struct page itself. The memory is accessed through the accessor functions
 * provided by this code. During boot, it checks whether allocating a huge
 * chunk of memory is needed at all; if not, no memory is allocated. Thanks
 * to this, the feature can be enabled in the kernel by default while
 * avoiding rebuilds and the problems described above.
 *
 * To make this work, clients provide two callbacks. The need callback is
 * mandatory if a client wants to avoid useless memory allocation at boot
 * time. The init callback is optional and is used to perform proper
 * initialization after the memory has been allocated.
 *
 * The need callback decides whether extended memory allocation is needed.
 * Sometimes users deactivate certain features for a given boot, making the
 * extra memory unnecessary. To avoid allocating a huge chunk of memory in
 * that case, each client reports its need for extra memory through the need
 * callback. If any need callback returns true, someone needs extra memory
 * and the page extension core allocates it. If none of them returns true,
 * no memory is needed for this boot and the core skips the allocation
 * entirely, so no memory is wasted.
 *
 * When the need callback returns true, page_ext checks whether extra memory
 * is requested through size in struct page_ext_operations. If it is
 * non-zero, extra space is allocated for each page_ext entry and the offset
 * is returned to the client through offset in struct page_ext_operations.
 *
 * The init callback performs proper initialization once page extension is
 * completely set up. On sparse memory systems, this memory is allocated
 * some time after the memmap, i.e. the lifetime of page extension memory
 * differs from that of the memmap for struct page. Clients therefore cannot
 * store extra data until page extension is initialized, even though pages
 * are already allocated and freely used, which could leave the per-page
 * extra data in an inadequate state. To prevent that, a client can use this
 * callback to initialize its state correctly.
 */
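
/*
 * Illustrative sketch only - there is no "foo" client in this file, and the
 * names below (foo_enabled, struct foo_data, need_foo, init_foo, foo_ops)
 * are made up. A client would typically describe itself like this and add
 * &foo_ops to the page_ext_ops[] array further down:
 *
 *	static bool need_foo(void)
 *	{
 *		return foo_enabled;	// e.g. set by a boot parameter
 *	}
 *
 *	static void init_foo(void)
 *	{
 *		// page_ext is fully set up here; initialize per-page state
 *	}
 *
 *	struct page_ext_operations foo_ops = {
 *		.size = sizeof(struct foo_data), // extra bytes per page_ext
 *		.need = need_foo,
 *		.init = init_foo,
 *	};
 *
 * After invoke_need_callbacks(), foo_ops.offset tells where struct foo_data
 * lives inside each page_ext entry.
 */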

#ifdef CONFIG_SPARSEMEM
#define PAGE_EXT_INVALID	(0x1)
#endif

#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
static bool need_page_idle(void)
{
	return true;
}

static struct page_ext_operations page_idle_ops __initdata = {
	.need = need_page_idle,
	.need_shared_flags = true,
};
#endif

static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
	&page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
	&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
	&page_table_check_ops,
#endif
};

unsigned long page_ext_size;

static unsigned long total_usage;
static struct page_ext *lookup_page_ext(const struct page *page);

bool early_page_ext __meminitdata;
static int __init setup_early_page_ext(char *str)
{
	early_page_ext = true;
	return 0;
}
early_param("early_page_ext", setup_early_page_ext);

static bool __init invoke_need_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);
	bool need = false;

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need()) {
			if (page_ext_ops[i]->need_shared_flags) {
				page_ext_size = sizeof(struct page_ext);
				break;
			}
		}
	}

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need()) {
			page_ext_ops[i]->offset = page_ext_size;
			page_ext_size += page_ext_ops[i]->size;
			need = true;
		}
	}

	return need;
}

static void __init invoke_init_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->init)
			page_ext_ops[i]->init();
	}
}

#ifndef CONFIG_SPARSEMEM
void __init page_ext_init_flatmem_late(void)
{
	invoke_init_callbacks();
}
#endif

static inline struct page_ext *get_entry(void *base, unsigned long index)
{
	return base + page_ext_size * index;
}

/**
 * page_ext_get() - Get the extended information for a page.
 * @page: The page we're interested in.
 *
 * Ensures that the page_ext will remain valid until page_ext_put()
 * is called.
 *
 * Return: NULL if no page_ext exists for this page.
 * Context: Any context. Caller may not sleep until they have called
 * page_ext_put().
 */
struct page_ext *page_ext_get(struct page *page)
{
	struct page_ext *page_ext;

	rcu_read_lock();
	page_ext = lookup_page_ext(page);
	if (!page_ext) {
		rcu_read_unlock();
		return NULL;
	}

	return page_ext;
}

/**
 * page_ext_put() - Working with page extended information is done.
 * @page_ext: Page extended information received from page_ext_get().
 *
 * The page extended information of the page may not be valid after this
 * function is called.
 *
 * Return: None.
 * Context: Any context where the matching page_ext_get() was called.
 */
void page_ext_put(struct page_ext *page_ext)
{
	if (unlikely(!page_ext))
		return;

	rcu_read_unlock();
}
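
/*
 * Illustrative usage sketch for the pair above (the "foo" client and its
 * foo_ops/struct foo_data are hypothetical, as in the sketch near the top
 * of this file):
 *
 *	struct page_ext *page_ext = page_ext_get(page);
 *	struct foo_data *data;
 *
 *	if (!page_ext)
 *		return;
 *	data = (void *)page_ext + foo_ops.offset;
 *	// ... read or update data; sleeping is not allowed here ...
 *	page_ext_put(page_ext);
 *
 * page_ext_get() takes the RCU read lock on success, so the caller must not
 * sleep until the matching page_ext_put().
 */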

#ifndef CONFIG_SPARSEMEM

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
	pgdat->node_page_ext = NULL;
}

static struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long index;
	struct page_ext *base;

	WARN_ON_ONCE(!rcu_read_lock_held());
	base = NODE_DATA(page_to_nid(page))->node_page_ext;
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (unlikely(!base))
		return NULL;
	index = pfn - round_down(node_start_pfn(page_to_nid(page)),
					MAX_ORDER_NR_PAGES);
	return get_entry(base, index);
}

static int __init alloc_node_page_ext(int nid)
{
	struct page_ext *base;
	unsigned long table_size;
	unsigned long nr_pages;

	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	if (!nr_pages)
		return 0;

	/*
	 * Need extra space if node range is not aligned with
	 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
	 * checks buddy's status, range could be out of exact node range.
	 */
	if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
	    !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
		nr_pages += MAX_ORDER_NR_PAGES;

	table_size = page_ext_size * nr_pages;

	base = memblock_alloc_try_nid(
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_ext = base;
	total_usage += table_size;
	return 0;
}

void __init page_ext_init_flatmem(void)
{
	int nid, fail;

	if (!invoke_need_callbacks())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_ext(nid);
		if (fail)
			goto fail;
	}
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	return;

fail:
	pr_crit("allocation of page_ext failed.\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */
static bool page_ext_invalid(struct page_ext *page_ext)
{
	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
}

static struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
	struct page_ext *page_ext = READ_ONCE(section->page_ext);

	WARN_ON_ONCE(!rcu_read_lock_held());
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (page_ext_invalid(page_ext))
		return NULL;
	return get_entry(page_ext, pfn);
}
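
/*
 * Note on the encoding used by lookup_page_ext() above and established in
 * init_section_page_ext() below: section->page_ext does not point at the
 * allocated table itself but at (base - section_start_pfn * page_ext_size),
 * so get_entry() can be passed the absolute pfn directly. In addition, the
 * low PAGE_EXT_INVALID bit is set on the pointer while a section is being
 * offlined, which makes lookups fail for RCU readers until the table is
 * actually freed.
 */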

static void *__meminit alloc_page_ext(size_t size, int nid)
{
	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
	void *addr = NULL;

	addr = alloc_pages_exact_nid(nid, size, flags);
	if (addr) {
		kmemleak_alloc(addr, size, 1, flags);
		return addr;
	}

	addr = vzalloc_node(size, nid);

	return addr;
}

static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
	struct mem_section *section;
	struct page_ext *base;
	unsigned long table_size;

	section = __pfn_to_section(pfn);

	if (section->page_ext)
		return 0;

	table_size = page_ext_size * PAGES_PER_SECTION;
	base = alloc_page_ext(table_size, nid);

	/*
	 * The value stored in section->page_ext is (base - pfn)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		pr_err("page ext allocation failure\n");
		return -ENOMEM;
	}

	/*
	 * The passed "pfn" may not be aligned to SECTION. For the calculation
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
	section->page_ext = (void *)base - page_ext_size * pfn;
	total_usage += table_size;
	return 0;
}

static void free_page_ext(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size;

		table_size = page_ext_size * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		kmemleak_free(addr);
		free_pages_exact(addr, table_size);
	}
}

static void __free_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_ext *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;

	base = READ_ONCE(ms->page_ext);
	/*
	 * page_ext here can be valid while doing the roll back
	 * operation in online_page_ext().
	 */
	if (page_ext_invalid(base))
		base = (void *)base - PAGE_EXT_INVALID;
	WRITE_ONCE(ms->page_ext, NULL);

	base = get_entry(base, pfn);
	free_page_ext(base);
}

static void __invalidate_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	void *val;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;
	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
	WRITE_ONCE(ms->page_ext, val);
}
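
/*
 * Memory hotplug support. online_page_ext() and offline_page_ext() below
 * are driven by page_ext_callback(), registered as a memory hotplug
 * notifier in page_ext_init(). The offline path must not free a table while
 * a lockless reader that entered via page_ext_get() is still using it,
 * hence the invalidate -> synchronize_rcu() -> free sequence in
 * offline_page_ext().
 */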

static int __meminit online_page_ext(unsigned long start_pfn,
				unsigned long nr_pages,
				int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	if (nid == NUMA_NO_NODE) {
		/*
		 * In this case, "nid" already exists and contains valid memory.
		 * "start_pfn" passed to us is a pfn which is an arg for
		 * online_pages(), and start_pfn should exist.
		 */
		nid = pfn_to_nid(start_pfn);
		VM_BUG_ON(!node_online(nid));
	}

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
		fail = init_section_page_ext(pfn, nid);
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);

	return -ENOMEM;
}

static int __meminit offline_page_ext(unsigned long start_pfn,
				unsigned long nr_pages)
{
	unsigned long start, end, pfn;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	/*
	 * Freeing of page_ext is done in 3 steps to avoid
	 * use-after-free of it:
	 * 1) Traverse all the sections and mark their page_ext
	 *    as invalid.
	 * 2) Wait for all the existing users of page_ext who
	 *    started before invalidation to finish.
	 * 3) Free the page_ext.
	 */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__invalidate_page_ext(pfn);

	synchronize_rcu();

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);
	return 0;
}

static int __meminit page_ext_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_ext(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages);
		break;
	case MEM_CANCEL_ONLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages);
		break;
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

void __init page_ext_init(void)
{
	unsigned long pfn;
	int nid;

	if (!invoke_need_callbacks())
		return;

	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		start_pfn = node_start_pfn(nid);
		end_pfn = node_end_pfn(nid);
		/*
		 * start_pfn and end_pfn may not be aligned to SECTION and the
		 * page->flags of out of node pages are not initialized. So we
		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
		 */
		for (pfn = start_pfn; pfn < end_pfn;
			pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

			if (!pfn_valid(pfn))
				continue;
			/*
			 * Nodes' pfn ranges can overlap.
			 * We know some arch can have a nodes layout such as
			 * -------------pfn-------------->
			 * N0 | N1 | N2 | N0 | N1 | N2|....
			 */
			if (pfn_to_nid(pfn) != nid)
				continue;
			if (init_section_page_ext(pfn, nid))
				goto oom;
			cond_resched();
		}
	}
	hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	invoke_init_callbacks();
	return;

oom:
	panic("Out of memory");
}

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif