// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>

/*
 * struct page extension
 *
 * This feature manages memory for extended data per page.
 *
 * Without it, storing extra data per page means modifying struct page
 * itself, which requires rebuilding the kernel: a time consuming process
 * that is sometimes impossible because of third party module dependencies.
 * Worse, enlarging struct page can cause unwanted changes in system
 * behaviour.
 *
 * This feature is intended to overcome those problems. It allocates memory
 * for the extended per-page data somewhere other than struct page itself,
 * and that memory is reached through the accessor functions provided here.
 * During boot, it checks whether the (potentially huge) allocation is needed
 * at all; if not, no memory is allocated. Thanks to this, the feature can be
 * built into the kernel by default, avoiding rebuilds and the problems
 * described above without wasting memory.
 *
 * To make this work, clients provide two callbacks. The need callback is
 * mandatory if a client wants to avoid useless memory allocation at boot
 * time. The init callback is optional and performs proper initialization
 * after the memory has been allocated.
 *
 * The need callback decides whether the extended memory must be allocated.
 * Some features may be deactivated for a particular boot, making their extra
 * memory unnecessary; to avoid allocating a huge chunk of memory for
 * nothing, each client reports its requirement through the need callback.
 * If any need callback returns true, someone needs extra memory and the page
 * extension core allocates it. If none returns true, no memory is needed for
 * this boot and the core skips the allocation entirely, so no memory is
 * wasted.
 *
 * When a need callback returns true, page_ext also checks whether the client
 * requests extra space via the size field of struct page_ext_operations. If
 * it is non-zero, that much extra space is reserved in each page_ext entry
 * and its location is reported back to the client through the offset field
 * of struct page_ext_operations.
 *
 * The init callback performs the client's initialization once page extension
 * itself is completely initialized. On sparse memory systems, the page
 * extension memory is allocated some time after the memmap, i.e. its
 * lifetime differs from that of the memmap for struct page. Clients
 * therefore cannot store extra data until page extension is initialized,
 * even though pages may already be allocated and freely used, which could
 * leave the extra per-page data in an inadequate state. To prevent that, a
 * client can use this callback to initialize its state correctly.
 */
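
/*
 * Editor's illustrative sketch (not part of the original file): roughly how
 * a hypothetical client "foo" would hook into page_ext. The identifiers
 * foo_ops, need_foo, init_foo, foo_enabled and struct foo_page_data are
 * assumptions made for this example; real clients such as page_owner follow
 * the same pattern and are listed in page_ext_ops[] below.
 */
#if 0	/* example only, never built */
struct foo_page_data {
	unsigned long counter;		/* per-page data owned by "foo" */
};

static bool foo_enabled;		/* e.g. set from a boot parameter */

static bool need_foo(void)
{
	/* Only request memory when the feature is actually enabled. */
	return foo_enabled;
}

static void init_foo(void)
{
	/* Runs once page_ext is fully initialized; set up global state. */
}

static struct page_ext_operations foo_ops = {
	.size = sizeof(struct foo_page_data),	/* extra bytes per entry */
	.need = need_foo,	/* mandatory to avoid useless allocation */
	.init = init_foo,	/* optional */
};
#endif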

#ifdef CONFIG_SPARSEMEM
#define PAGE_EXT_INVALID	(0x1)
#endif

#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
static bool need_page_idle(void)
{
	return true;
}
static struct page_ext_operations page_idle_ops __initdata = {
	.need = need_page_idle,
};
#endif

static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
	&page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
	&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
	&page_table_check_ops,
#endif
};

unsigned long page_ext_size = sizeof(struct page_ext);

static unsigned long total_usage;
static struct page_ext *lookup_page_ext(const struct page *page);

bool early_page_ext;
static int __init setup_early_page_ext(char *str)
{
	early_page_ext = true;
	return 0;
}
early_param("early_page_ext", setup_early_page_ext);

static bool __init invoke_need_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);
	bool need = false;

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
			page_ext_ops[i]->offset = page_ext_size;
			page_ext_size += page_ext_ops[i]->size;
			need = true;
		}
	}

	return need;
}

static void __init invoke_init_callbacks(void)
{
	int i;
	int entries = ARRAY_SIZE(page_ext_ops);

	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->init)
			page_ext_ops[i]->init();
	}
}

#ifndef CONFIG_SPARSEMEM
void __init page_ext_init_flatmem_late(void)
{
	invoke_init_callbacks();
}
#endif

static inline struct page_ext *get_entry(void *base, unsigned long index)
{
	return base + page_ext_size * index;
}

/**
 * page_ext_get() - Get the extended information for a page.
 * @page: The page we're interested in.
 *
 * Ensures that the page_ext will remain valid until page_ext_put()
 * is called.
 *
 * Return: NULL if no page_ext exists for this page.
 * Context: Any context.  Caller may not sleep until they have called
 * page_ext_put().
 */
struct page_ext *page_ext_get(struct page *page)
{
	struct page_ext *page_ext;

	rcu_read_lock();
	page_ext = lookup_page_ext(page);
	if (!page_ext) {
		rcu_read_unlock();
		return NULL;
	}

	return page_ext;
}

/**
 * page_ext_put() - Working with page extended information is done.
 * @page_ext: Page extended information received from page_ext_get().
 *
 * The page extended information of the page may not be valid after this
 * function is called.
 *
 * Return: None.
 * Context: Any context in which the corresponding page_ext_get() was called.
 */
void page_ext_put(struct page_ext *page_ext)
{
	if (unlikely(!page_ext))
		return;

	rcu_read_unlock();
}
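
/*
 * Editor's illustrative sketch (not part of the original file): the typical
 * lookup pattern for a page_ext client. The per-client data lives at
 * ops->offset bytes into each page_ext entry, as set up by
 * invoke_need_callbacks() above. foo_ops, struct foo_page_data and
 * foo_note_page() are the hypothetical names from the earlier sketch.
 */
#if 0	/* example only, never built */
static struct foo_page_data *get_foo_data(struct page_ext *page_ext)
{
	return (void *)page_ext + foo_ops.offset;
}

static void foo_note_page(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct foo_page_data *data;

	/* No page_ext yet (early boot, hotplug race): nothing to do. */
	if (unlikely(!page_ext))
		return;

	data = get_foo_data(page_ext);
	data->counter++;

	/* page_ext_get() took rcu_read_lock(), so no sleeping until here. */
	page_ext_put(page_ext);
}
#endif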

#ifndef CONFIG_SPARSEMEM

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
	pgdat->node_page_ext = NULL;
}

static struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long index;
	struct page_ext *base;

	WARN_ON_ONCE(!rcu_read_lock_held());
	base = NODE_DATA(page_to_nid(page))->node_page_ext;
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (unlikely(!base))
		return NULL;
	index = pfn - round_down(node_start_pfn(page_to_nid(page)),
					MAX_ORDER_NR_PAGES);
	return get_entry(base, index);
}

static int __init alloc_node_page_ext(int nid)
{
	struct page_ext *base;
	unsigned long table_size;
	unsigned long nr_pages;

	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	if (!nr_pages)
		return 0;

	/*
	 * Need extra space if node range is not aligned with
	 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
	 * checks buddy's status, range could be out of exact node range.
	 */
	if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
		nr_pages += MAX_ORDER_NR_PAGES;

	table_size = page_ext_size * nr_pages;

	base = memblock_alloc_try_nid(
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_ext = base;
	total_usage += table_size;
	return 0;
}

void __init page_ext_init_flatmem(void)
{
	int nid, fail;

	if (!invoke_need_callbacks())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_ext(nid);
		if (fail)
			goto fail;
	}
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	return;

fail:
	pr_crit("allocation of page_ext failed.\n");
	panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */
static bool page_ext_invalid(struct page_ext *page_ext)
{
	return !page_ext ||
		(((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
}

static struct page_ext *lookup_page_ext(const struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
	struct page_ext *page_ext = READ_ONCE(section->page_ext);

	WARN_ON_ONCE(!rcu_read_lock_held());
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (page_ext_invalid(page_ext))
		return NULL;
	return get_entry(page_ext, pfn);
}

static void *__meminit alloc_page_ext(size_t size, int nid)
{
	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
	void *addr = NULL;

	addr = alloc_pages_exact_nid(nid, size, flags);
	if (addr) {
		kmemleak_alloc(addr, size, 1, flags);
		return addr;
	}

	addr = vzalloc_node(size, nid);

	return addr;
}

static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
	struct mem_section *section;
	struct page_ext *base;
	unsigned long table_size;

	section = __pfn_to_section(pfn);

	if (section->page_ext)
		return 0;

	table_size = page_ext_size * PAGES_PER_SECTION;
	base = alloc_page_ext(table_size, nid);

	/*
	 * The value stored in section->page_ext is (base - pfn)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		pr_err("page ext allocation failure\n");
		return -ENOMEM;
	}

	/*
	 * The passed "pfn" may not be aligned to SECTION. For the calculation
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
	section->page_ext = (void *)base - page_ext_size * pfn;
	total_usage += table_size;
	return 0;
}
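
/*
 * Editor's illustrative sketch (not part of the original file): why storing
 * "base - start_pfn * page_ext_size" in section->page_ext lets
 * lookup_page_ext() index by absolute pfn. Assuming the section's page_ext
 * pointer has not been tagged with PAGE_EXT_INVALID, both expressions below
 * resolve to the entry for @pfn. The function name page_ext_encoding_demo
 * is hypothetical.
 */
#if 0	/* example only, never built */
static void page_ext_encoding_demo(struct mem_section *ms, unsigned long pfn)
{
	unsigned long start_pfn = pfn & PAGE_SECTION_MASK;
	/* Recover the real start of the table allocated above. */
	void *base = (void *)ms->page_ext + page_ext_size * start_pfn;

	/* Indexing by absolute pfn == indexing the table from its start. */
	WARN_ON(get_entry(ms->page_ext, pfn) !=
		get_entry(base, pfn - start_pfn));
}
#endif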

static void free_page_ext(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size;

		table_size = page_ext_size * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		kmemleak_free(addr);
		free_pages_exact(addr, table_size);
	}
}

static void __free_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_ext *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;

	base = READ_ONCE(ms->page_ext);
	/*
	 * page_ext here can be valid while doing the roll back
	 * operation in online_page_ext().
	 */
	if (page_ext_invalid(base))
		base = (void *)base - PAGE_EXT_INVALID;
	WRITE_ONCE(ms->page_ext, NULL);

	base = get_entry(base, pfn);
	free_page_ext(base);
}

static void __invalidate_page_ext(unsigned long pfn)
{
	struct mem_section *ms;
	void *val;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;
	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
	WRITE_ONCE(ms->page_ext, val);
}

static int __meminit online_page_ext(unsigned long start_pfn,
				unsigned long nr_pages,
				int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	if (nid == NUMA_NO_NODE) {
		/*
		 * In this case, "nid" already exists and contains valid
		 * memory. "start_pfn" passed to us is a pfn which is an arg
		 * for online_pages(), and start_pfn should exist.
		 */
		nid = pfn_to_nid(start_pfn);
		VM_BUG_ON(!node_online(nid));
	}

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
		fail = init_section_page_ext(pfn, nid);
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);

	return -ENOMEM;
}

static int __meminit offline_page_ext(unsigned long start_pfn,
				unsigned long nr_pages)
{
	unsigned long start, end, pfn;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	/*
	 * Freeing of page_ext is done in 3 steps to avoid
	 * use-after-free of it:
	 * 1) Traverse all the sections and mark their page_ext
	 *    as invalid.
	 * 2) Wait for all the existing users of page_ext who
	 *    started before invalidation to finish.
	 * 3) Free the page_ext.
	 */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__invalidate_page_ext(pfn);

	synchronize_rcu();

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);
	return 0;
}
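
/*
 * Editor's illustrative sketch (not part of the original file): how the
 * three-step teardown above pairs with readers. A reader that looked up its
 * page_ext before step 1 is still inside rcu_read_lock(), so step 2
 * (synchronize_rcu()) waits for it to call page_ext_put(); a reader that
 * starts after step 1 sees the PAGE_EXT_INVALID tag and gets NULL. The
 * function name reader_side_sketch is hypothetical.
 */
#if 0	/* example only, never built */
static void reader_side_sketch(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);	/* rcu_read_lock() */

	if (!page_ext)
		return;	/* section invalidated or never allocated */

	/*
	 * Safe to dereference until page_ext_put(): free_page_ext() cannot
	 * run before the grace period in offline_page_ext() has elapsed.
	 */
	page_ext_put(page_ext);				/* rcu_read_unlock() */
}
#endif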

static int __meminit page_ext_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_ext(mn->start_pfn,
				   mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages);
		break;
	case MEM_CANCEL_ONLINE:
		offline_page_ext(mn->start_pfn,
				mn->nr_pages);
		break;
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

void __init page_ext_init(void)
{
	unsigned long pfn;
	int nid;

	if (!invoke_need_callbacks())
		return;

	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		start_pfn = node_start_pfn(nid);
		end_pfn = node_end_pfn(nid);
		/*
		 * start_pfn and end_pfn may not be aligned to SECTION and the
		 * page->flags of out of node pages are not initialized. So we
		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
		 */
		for (pfn = start_pfn; pfn < end_pfn;
			pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

			if (!pfn_valid(pfn))
				continue;
			/*
			 * Nodes' pfns can overlap.
			 * We know some architectures can have a node layout
			 * such as
			 * -------------pfn-------------->
			 * N0 | N1 | N2 | N0 | N1 | N2 | ...
			 */
			if (pfn_to_nid(pfn) != nid)
				continue;
			if (init_section_page_ext(pfn, nid))
				goto oom;
			cond_resched();
		}
	}
	hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
	invoke_init_callbacks();
	return;

oom:
	panic("Out of memory");
}

void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif
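
/*
 * Editor's illustrative sketch (not part of the original file): how a memory
 * hotplug event reaches the code above. In reality the hotplug core fills in
 * struct memory_notify and invokes the notifier chain that page_ext_callback
 * was registered on via hotplug_memory_notifier() in page_ext_init(); the
 * values and the function name page_ext_hotplug_demo below are made up for
 * this example.
 */
#if 0	/* example only, never built */
static void page_ext_hotplug_demo(void)
{
	struct memory_notify mn = {
		.start_pfn = 0x100000,		/* hypothetical pfn */
		.nr_pages = PAGES_PER_SECTION,
		.status_change_nid = NUMA_NO_NODE,
	};

	/* Allocate page_ext for the range before it goes online... */
	page_ext_callback(NULL, MEM_GOING_ONLINE, &mn);
	/* ...and free it again once the range has gone offline. */
	page_ext_callback(NULL, MEM_OFFLINE, &mn);
}
#endif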