// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <linux/pgalloc.h>

#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
#include "internal.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked.
 * @vmemmap_head:	the page to be installed as the first page in the
 *			vmemmap range.
 * @vmemmap_tail:	the page to be installed for every non-first page in
 *			the vmemmap range.
 * @vmemmap_pages:	the list head collecting the vmemmap pages that can be
 *			freed, or supplying the pages to remap from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*vmemmap_head;
	struct page		*vmemmap_tail;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long		flags;
};

static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}
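/*
 * Illustration of the split (a sketch, assuming x86-64 with 4 KiB base
 * pages): one vmemmap PMD maps PMD_SIZE = 2 MiB, i.e. PTRS_PER_PTE = 512
 * base pages. Splitting replaces the single leaf PMD with a PTE table whose
 * 512 entries map exactly the same physical pages, so individual 4 KiB
 * vmemmap pages can later be remapped and freed one by one:
 *
 *	pmd (leaf, 2 MiB)		pmd -> pte[0..511] (4 KiB each)
 *	[ head .. head+511 ]   ==>	[ head ][ head+1 ] ... [ head+511 ]
 *
 * The mapping itself is unchanged, only its granularity; that is why no
 * struct page contents need to be copied here, and why split_page() must
 * teach the buddy allocator to treat the 512 pages as independently
 * freeable.
 */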
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, it is sufficient to check
	 * whether the vmemmap page associated with the first vmemmap page
	 * is self-hosted.
	 *
	 * [                hotplugged memory                ]
	 * [      section      ][...][      section      ]
	 * [ vmemmap ][            usable memory          ]
	 *   ^  |      ^                                  |
	 *   +--+      |                                  |
	 *             +----------------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
					   NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}
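/*
 * Worked example of the walk extents (a sketch, assuming 4 KiB pages and a
 * 64-byte struct page): a 2 MiB HugeTLB folio has 512 struct pages, i.e.
 * 512 * 64 = 32 KiB of vmemmap, so [@start, @end) spans 8 base pages and a
 * remap pass invokes ->remap_pte exactly 8 times. For a 1 GiB folio the
 * range covers 16 MiB of vmemmap (4096 PTEs). This is also why the PMD
 * split above may be needed first: the range is walked and remapped at PTE
 * granularity.
 */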
/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator and must be freed via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		memmap_boot_pages_add(-1);
		free_bootmem_page(page);
	} else {
		memmap_pages_add(-1);
		__free_page(page);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(walk->nr_walked == 0 && walk->vmemmap_head)) {
		list_del(&walk->vmemmap_head->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();

		entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL);
	} else {
		/*
		 * Remap the tail pages as read-only to catch illegal write
		 * operations to the tail pages.
		 */
		entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO);
	}

	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	struct page *page;
	struct page *from, *to;

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);

	/*
	 * Initialize tail pages in the newly allocated vmemmap page.
	 *
	 * There is folio-scope metadata that is encoded in the first few
	 * tail pages.
	 *
	 * Use the last struct page in the vmemmap page containing the head
	 * page to initialize the rest of the tail pages.
	 */
	from = compound_head((struct page *)addr) +
	       PAGE_SIZE / sizeof(struct page) - 1;
	to = page_to_virt(page);
	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++, to++)
		*to = *from;

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
}
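/*
 * Before/after sketch of a remap pass for a 2 MiB folio (illustrative;
 * assumes the 8-page vmemmap range from the example above):
 *
 *	before:	pte[0] -> vmemmap page 0 (RW)	after:	pte[0] -> vmemmap_head (RW)
 *		pte[1] -> vmemmap page 1 (RW)		pte[1] -> vmemmap_tail (RO)
 *		...					...
 *		pte[7] -> vmemmap page 7 (RW)		pte[7] -> vmemmap_tail (RO)
 *
 * All eight previously mapped pages end up on @vmemmap_pages and can be
 * freed. vmemmap_tail is shared zone-wide, so a stray write to any tail
 * struct page faults immediately instead of silently corrupting shared
 * state.
 */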
/**
 * vmemmap_remap_split - split the PMD mappings backing the vmemmap virtual
 *			 address range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	return vmemmap_remap_range(start, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to use @vmemmap_head/@vmemmap_tail, then free the
 *			vmemmap pages the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @vmemmap_head: the page to be installed as the first page in the vmemmap
 *		range.
 * @vmemmap_tail: the page to be installed for every non-first page in the
 *		vmemmap range.
 * @vmemmap_pages: list to deposit the vmemmap pages to be freed. It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      struct page *vmemmap_head,
			      struct page *vmemmap_tail,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.vmemmap_head	= vmemmap_head,
		.vmemmap_tail	= vmemmap_tail,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};

	ret = vmemmap_remap_range(start, end, &walk);
	if (!ret || !walk.nr_walked)
		return ret;

	end = start + walk.nr_walked * PAGE_SIZE;

	/*
	 * vmemmap_pages contains pages from the previous vmemmap_remap_range()
	 * call which failed. These are pages which were removed from
	 * the vmemmap. They will be restored in the following call.
	 */
	walk = (struct vmemmap_remap_walk) {
		.remap_pte	= vmemmap_restore_pte,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= 0,
	};

	vmemmap_remap_range(start, end, &walk);

	return ret;
}
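/*
 * Back-of-the-envelope savings (a sketch, assuming 4 KiB pages and a 64-byte
 * struct page, matching Documentation/mm/vmemmap_dedup.rst): a successful
 * vmemmap_remap_free() over a 2 MiB folio's 8 vmemmap pages deposits all 8
 * on @vmemmap_pages while consuming 1 new head page, a net saving of 7 pages
 * (28 KiB) per folio. For a 1 GiB folio, 4096 vmemmap pages collapse to 1
 * head page plus the shared tail page: roughly 16 MiB saved per folio.
 *
 * The caller pattern mirrors __hugetlb_vmemmap_optimize_folio() below
 * (illustrative only, with hypothetical head/tail/start values):
 *
 *	LIST_HEAD(pages);
 *
 *	ret = vmemmap_remap_free(start, start + 8 * PAGE_SIZE,
 *				 head, tail, &pages, 0);
 *	if (!ret)
 *		free_vmemmap_page_list(&pages);
 */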
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}
	memmap_pages_add(nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to freshly allocated pages, one page per PTE.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @flags:	modifications to vmemmap_remap_walk flags.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(start, end, &walk);
}

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);

static int __init hugetlb_vmemmap_optimize_param(char *buf)
{
	return kstrtobool(buf, &vmemmap_optimize_enabled);
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);

static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start, vmemmap_end;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_start = (unsigned long)&folio->page;
	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);

	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) is mapped to were freed to the buddy allocator.
	 * Before the HugeTLB page can be freed to the buddy allocator, the
	 * previously discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags);
	if (!ret)
		folio_clear_hugetlb_vmemmap_optimized(folio);

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap
 *				   pages which will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}
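/*
 * Administrator interface, for reference (documented in
 * Documentation/admin-guide/kernel-parameters.txt and
 * Documentation/admin-guide/sysctl/vm.rst):
 *
 *	hugetlb_free_vmemmap=on		boot-time enable (the early_param
 *					above)
 *	vm.hugetlb_optimize_vmemmap=1	runtime enable via sysctl; only
 *					affects HugeTLB pages allocated
 *					afterwards
 *
 * Note that restoring an optimized folio at runtime can fail with -ENOMEM,
 * in which case the folio cannot be dissolved or freed at that point.
 */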
/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true if the vmemmap of a HugeTLB folio should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone)
{
	const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER;
	struct page *tail, *p;
	int node = zone_to_nid(zone);

	tail = READ_ONCE(zone->vmemmap_tails[idx]);
	if (likely(tail))
		return tail;

	tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
	if (!tail)
		return NULL;

	p = page_to_virt(tail);
	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
		init_compound_tail(p + i, NULL, order, zone);

	/* Publish the tail page; if we lose the race, use the winner's. */
	if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) {
		__free_page(tail);
		tail = READ_ONCE(zone->vmemmap_tails[idx]);
	}

	return tail;
}
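/*
 * Race sketch for vmemmap_get_tail() (illustrative): if CPUs A and B both
 * observe zone->vmemmap_tails[idx] == NULL, both allocate and initialize a
 * candidate page, but only one cmpxchg() installs NULL -> page. The loser
 * sees a non-NULL old value, frees its own candidate and re-reads the
 * winner's page. The published tail page is then reused for this zone and
 * order from that point on, so a plain READ_ONCE() suffices on the fast
 * path.
 */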
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	unsigned long vmemmap_start, vmemmap_end;
	struct page *vmemmap_head, *vmemmap_tail;
	int nid, ret = 0;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	nid = folio_nid(folio);
	vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio));
	if (!vmemmap_tail)
		return -ENOMEM;

	/*
	 * Very subtle: if VMEMMAP_REMAP_NO_TLB_FLUSH is set, the TLB is not
	 * flushed immediately after remapping. As a result, subsequent
	 * accesses and modifications to struct pages associated with the
	 * HugeTLB page could go to the OLD struct pages. Set the vmemmap
	 * optimized flag here so that it is copied to the new head page;
	 * this keeps the old and new struct pages in sync. If there is an
	 * error during optimization, we immediately flush the TLB and clear
	 * the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
	if (!vmemmap_head) {
		ret = -ENOMEM;
		goto out;
	}

	copy_page(page_to_virt(vmemmap_head), folio);
	list_add(&vmemmap_head->lru, vmemmap_pages);
	memmap_pages_add(1);

	vmemmap_start = (unsigned long)&folio->page;
	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end).
	 * Add pages previously mapping the range to the vmemmap_pages list so
	 * that they can be freed by the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end,
				 vmemmap_head, vmemmap_tail,
				 vmemmap_pages, flags);
out:
	if (ret)
		folio_clear_hugetlb_vmemmap_optimized(folio);

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start, vmemmap_end;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_start = (unsigned long)&folio->page;
	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end);
}

static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
					      struct list_head *folio_list,
					      bool boot)
{
	struct folio *folio;
	int nr_to_optimize;
	LIST_HEAD(vmemmap_pages);
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH;

	nr_to_optimize = 0;
	list_for_each_entry(folio, folio_list, lru) {
		int ret;
		unsigned long spfn, epfn;

		if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
			/*
			 * Already optimized by pre-HVO, just map the
			 * mirrored tail page structs RO.
			 */
			spfn = (unsigned long)&folio->page;
			epfn = spfn + pages_per_huge_page(h);
			vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
					      HUGETLB_VMEMMAP_RESERVE_SIZE);
			register_page_bootmem_memmap(pfn_to_section_nr(spfn),
						     &folio->page,
						     HUGETLB_VMEMMAP_RESERVE_SIZE);
			continue;
		}

		nr_to_optimize++;

		ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, thus let's
		 * fail early once we encounter the first OOM. There is no
		 * point in retrying, as it can be done dynamically on remap
		 * with the memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	if (!nr_to_optimize)
		/*
		 * All pre-HVO folios, nothing left to do. It's ok if
		 * there is a mix of pre-HVO and not yet HVO-ed folios
		 * here, as __hugetlb_vmemmap_optimize_folio() will
		 * skip any folios that already have the optimized flag
		 * set, see vmemmap_should_optimize_folio().
		 */
		goto out;

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur in the case that both splitting fails
		 * halfway and head page allocation also failed. In this
		 * case __hugetlb_vmemmap_optimize_folio() would free memory,
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		}
	}

out:
	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, false);
}

void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, true);
}
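/*
 * TLB-flush batching, for reference: with VMEMMAP_SPLIT_NO_TLB_FLUSH and
 * VMEMMAP_REMAP_NO_TLB_FLUSH set, optimizing a list of N folios costs two
 * global flushes (one flush_tlb_all() after all splits, one after all
 * remaps), plus at most one extra flush per ENOMEM retry, instead of up to
 * N range flushes per phase when folios are optimized one at a time.
 */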
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT

/* Return true if a bootmem-allocated HugeTLB page should be pre-HVO-ed */
static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
{
	unsigned long section_size, psize, pmd_vmemmap_size;
	phys_addr_t paddr;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(m->hstate))
		return false;

	psize = huge_page_size(m->hstate);
	paddr = virt_to_phys(m);

	/*
	 * Pre-HVO only works if the bootmem huge page
	 * is aligned to the section size.
	 */
	section_size = (1UL << PA_SECTION_SHIFT);
	if (!IS_ALIGNED(paddr, section_size) ||
	    !IS_ALIGNED(psize, section_size))
		return false;

	/*
	 * The pre-HVO code does not deal with splitting PMDs,
	 * so the bootmem page must be aligned to the number
	 * of base pages that can be mapped with one vmemmap PMD.
	 */
	pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
	if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
	    !IS_ALIGNED(psize, pmd_vmemmap_size))
		return false;

	return true;
}
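/*
 * Worked alignment example (a sketch, assuming x86-64: 4 KiB pages, 2 MiB
 * PMDs, 64-byte struct page, 128 MiB sections):
 *
 *	pmd_vmemmap_size = (PMD_SIZE / sizeof(struct page)) << PAGE_SHIFT
 *			 = (2 MiB / 64) * 4 KiB = 128 MiB
 *
 * One vmemmap PMD therefore describes 128 MiB of memory, matching the
 * section size. A 1 GiB bootmem HugeTLB page (size and address aligned to
 * 128 MiB) qualifies for pre-HVO; a 2 MiB page does not, and is optimized
 * later through the regular hugetlb_vmemmap_optimize_folios() path instead.
 */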
/*
 * Initialize memmap section for a gigantic page, HVO-style.
 */
void __init hugetlb_vmemmap_init_early(int nid)
{
	unsigned long psize, paddr, section_size;
	unsigned long ns, i, pnum, pfn, nr_pages;
	struct huge_bootmem_page *m = NULL;
	void *map;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	section_size = (1UL << PA_SECTION_SHIFT);

	list_for_each_entry(m, &huge_boot_pages[nid], list) {
		if (!vmemmap_should_optimize_bootmem_page(m))
			continue;

		nr_pages = pages_per_huge_page(m->hstate);
		psize = nr_pages << PAGE_SHIFT;
		paddr = virt_to_phys(m);
		pfn = PHYS_PFN(paddr);
		map = pfn_to_page(pfn);

		pnum = pfn_to_section_nr(pfn);
		ns = psize / section_size;

		for (i = 0; i < ns; i++) {
			sparse_init_early_section(nid, map, pnum,
						  SECTION_IS_VMEMMAP_PREINIT);
			map += section_map_size();
			pnum++;
		}

		m->flags |= HUGE_BOOTMEM_HVO;
	}
}

static struct zone *pfn_to_zone(int nid, unsigned long pfn)
{
	struct zone *zone;
	enum zone_type zone_type;

	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
		zone = &NODE_DATA(nid)->node_zones[zone_type];
		if (zone_spans_pfn(zone, pfn))
			return zone;
	}

	return NULL;
}

void __init hugetlb_vmemmap_init_late(int nid)
{
	struct huge_bootmem_page *m, *tm;
	unsigned long phys, nr_pages, start, end;
	unsigned long pfn, nr_mmap;
	struct zone *zone = NULL;
	struct hstate *h;
	void *map;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
		if (!(m->flags & HUGE_BOOTMEM_HVO))
			continue;

		phys = virt_to_phys(m);
		h = m->hstate;
		pfn = PHYS_PFN(phys);
		nr_pages = pages_per_huge_page(h);
		map = pfn_to_page(pfn);
		start = (unsigned long)map;
		end = start + nr_pages * sizeof(struct page);

		if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
			/*
			 * Oops, the hugetlb page spans multiple zones.
			 * Remove it from the list, and populate it normally.
			 */
			list_del(&m->list);

			vmemmap_populate(start, end, nid, NULL);
			nr_mmap = end - start;
			memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));

			memblock_phys_free(phys, huge_page_size(h));
			continue;
		}

		if (!zone || !zone_spans_pfn(zone, pfn))
			zone = pfn_to_zone(nid, pfn);
		if (WARN_ON_ONCE(!zone))
			continue;

		if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone,
					 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) {
			/* Fall back if HVO population fails */
			vmemmap_populate(start, end, nid, NULL);
			nr_mmap = end - start;
		} else {
			m->flags |= HUGE_BOOTMEM_ZONES_VALID;
			nr_mmap = HUGETLB_VMEMMAP_RESERVE_SIZE;
		}

		memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));
	}
}
#endif

static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;
	struct zone *zone;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_zone(zone) {
		for (int i = 0; i < NR_VMEMMAP_TAILS; i++) {
			struct page *tail, *p;
			unsigned int order;

			tail = zone->vmemmap_tails[i];
			if (!tail)
				continue;

			order = i + VMEMMAP_TAIL_MIN_ORDER;
			p = page_to_virt(tail);
			for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++)
				init_compound_tail(p + j, NULL, order, zone);
		}
	}

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);
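/*
 * Usage, for reference: once a vmemmap-optimizable hstate exists, the sysctl
 * registered above toggles the optimization for subsequently allocated
 * HugeTLB pages, e.g.:
 *
 *	echo 1 > /proc/sys/vm/hugetlb_optimize_vmemmap
 *	echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 *
 * Already-allocated pages are unaffected by flipping the knob; their vmemmap
 * is restored only when they are freed or dissolved.
 */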