// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *	Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <linux/pgalloc.h>

#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or that the range is mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
	unsigned long		flags;
};
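
/*
 * Split a vmemmap PMD leaf into a PTE page table that maps the same PMD_SIZE
 * range with base pages, so that individual vmemmap pages can later be
 * remapped and freed. The PMD is only repopulated if it is still a leaf once
 * init_mm.page_table_lock is held; otherwise the freshly allocated page
 * table is discarded.
 */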
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
	 *
	 * [                  hotplugged memory                  ]
	 * [        section        ][...][        section        ]
	 * [ vmemmap ][              usable memory               ]
	 *   ^   |      ^                                        |
	 *   +---+      |                                        |
	 *              +----------------------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walk, before
	 * starting the remapping.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
					   NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator; free it via free_bootmem_page().
 * Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		memmap_boot_pages_add(-1);
		free_bootmem_page(page);
	} else {
		memmap_pages_add(-1);
		__free_page(page);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}
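
/*
 * vmemmap_remap_pte() below points a tail vmemmap PTE at the shared
 * @reuse_page. Once every PTE in the walked range maps that single page, the
 * pages that previously backed the range can be handed back to the
 * allocator. As an illustration (assuming 4 KiB base pages and a 64-byte
 * struct page): a 2 MiB HugeTLB page has 512 struct pages, i.e. 32 KiB or
 * 8 vmemmap pages, of which one is kept and the remaining 7 become freeable.
 */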
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid value will
 * be checked in free_tail_page_prepare(). To avoid a "corrupted mapping in
 * tail page" message, we need to reset at least 4 struct page structs (one
 * head and three tail).
 */
#define NR_RESET_STRUCT_PAGE	4

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                       range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed. It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous, allowing for
	 * more hugepage allocations. Fall back to the currently mapped head
	 * page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		memmap_pages_add(1);
	}

	/*
	 * To make the remapping routine most efficient for the huge pages,
	 * the vmemmap page table walking routine has the following rules
	 * (see more details in vmemmap_pte_entry()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}
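
/*
 * Allocate one page for each vmemmap page that has to be restored for the
 * range [@start, @end), preferring the node that backs the struct pages.
 * On failure, free whatever was already allocated and return -ENOMEM.
 */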
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}
	memmap_pages_add(nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       to freshly allocated vmemmap pages, one page per
 *                       vmemmap page in the range.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}
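
/*
 * hugetlb_optimize_vmemmap_key is enabled while at least one folio has its
 * vmemmap optimized: each successful optimization increments it and each
 * restore (or optimization failure) decrements it. Whether HVO is applied at
 * all is controlled by vmemmap_optimize_enabled, whose default comes from
 * CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON; it can be overridden with
 * the "hugetlb_free_vmemmap=" boot parameter below and toggled at runtime
 * via the "vm.hugetlb_optimize_vmemmap" sysctl registered at the end of this
 * file.
 */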
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
static int __init hugetlb_vmemmap_optimize_param(char *buf)
{
	return kstrtobool(buf, &vmemmap_optimize_enabled);
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);

static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, the previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
			/* only need to synchronize_rcu() once for each batch */
			flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true iff the vmemmap of this HugeTLB folio should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}
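
/*
 * Try to free @folio's tail vmemmap pages: take the static-key reference,
 * honour VMEMMAP_SYNCHRONIZE_RCU (see the flag's comment above), mark the
 * folio optimized *before* remapping (see the "Very Subtle" note below), and
 * hand the pages that no longer back the vmemmap to the caller through
 * @vmemmap_pages. On failure, the static key and the optimized flag are
 * rolled back.
 */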
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to the vmemmap_pages list so that they can be freed
	 * by the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
	free_vmemmap_page_list(&vmemmap_pages);
}
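
/*
 * Split the vmemmap PMDs that back @folio's struct pages into PTE page
 * tables without remapping anything yet. This is a preparatory step used
 * when optimizing folios in bulk, so that the TLB flush for all the splits
 * can be batched into a single flush_tlb_all() by the caller.
 */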
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range
	 * [@vmemmap_start, @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
					      struct list_head *folio_list,
					      bool boot)
{
	struct folio *folio;
	int nr_to_optimize;
	LIST_HEAD(vmemmap_pages);
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	nr_to_optimize = 0;
	list_for_each_entry(folio, folio_list, lru) {
		int ret;
		unsigned long spfn, epfn;

		if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
			/*
			 * Already optimized by pre-HVO, just map the
			 * mirrored tail page structs RO.
			 */
			spfn = (unsigned long)&folio->page;
			epfn = spfn + pages_per_huge_page(h);
			vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
					      HUGETLB_VMEMMAP_RESERVE_SIZE);
			register_page_bootmem_memmap(pfn_to_section_nr(spfn),
						     &folio->page,
						     HUGETLB_VMEMMAP_RESERVE_SIZE);
			static_branch_inc(&hugetlb_optimize_vmemmap_key);
			continue;
		}

		nr_to_optimize++;

		ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, thus let's
		 * fail early once we encounter the first OOM. There is no
		 * point in retrying, as it can be dynamically done on remap
		 * with the memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	if (!nr_to_optimize)
		/*
		 * All pre-HVO folios, nothing left to do. It's ok if
		 * there is a mix of pre-HVO and not yet HVO-ed folios
		 * here, as __hugetlb_vmemmap_optimize_folio() will
		 * skip any folios that already have the optimized flag
		 * set, see vmemmap_should_optimize_folio().
		 */
		goto out;

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		/* only need to synchronize_rcu() once for each batch */
		flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur in the case that both splitting fails
		 * halfway and head page allocation also failed. In this
		 * case __hugetlb_vmemmap_optimize_folio() would free memory
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		}
	}

out:
	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, false);
}

void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, true);
}
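
/*
 * Pre-HVO: when HugeTLB pages are allocated from bootmem, their vmemmap can
 * be populated in the optimized layout from the start (via
 * vmemmap_populate_hvo()), so those pages never need to be remapped and
 * freed later. hugetlb_vmemmap_init_early() sets this up per node;
 * hugetlb_vmemmap_init_late() undoes it for bootmem pages that turn out to
 * span more than one zone.
 */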
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT

/* Return true if a bootmem-allocated HugeTLB page should be pre-HVO-ed */
static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
{
	unsigned long section_size, psize, pmd_vmemmap_size;
	phys_addr_t paddr;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(m->hstate))
		return false;

	psize = huge_page_size(m->hstate);
	paddr = virt_to_phys(m);

	/*
	 * Pre-HVO only works if the bootmem huge page
	 * is aligned to the section size.
	 */
	section_size = (1UL << PA_SECTION_SHIFT);
	if (!IS_ALIGNED(paddr, section_size) ||
	    !IS_ALIGNED(psize, section_size))
		return false;

	/*
	 * The pre-HVO code does not deal with splitting PMDs,
	 * so the bootmem page must be aligned to the number
	 * of base pages that can be mapped with one vmemmap PMD.
	 */
	pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
	if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
	    !IS_ALIGNED(psize, pmd_vmemmap_size))
		return false;

	return true;
}

/*
 * Initialize memmap section for a gigantic page, HVO-style.
 */
void __init hugetlb_vmemmap_init_early(int nid)
{
	unsigned long psize, paddr, section_size;
	unsigned long ns, i, pnum, pfn, nr_pages;
	unsigned long start, end;
	struct huge_bootmem_page *m = NULL;
	void *map;

	/*
	 * Nothing to do if bootmem pages were not allocated
	 * early in boot, or if HVO wasn't enabled in the
	 * first place.
	 */
	if (!hugetlb_bootmem_allocated())
		return;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	section_size = (1UL << PA_SECTION_SHIFT);

	list_for_each_entry(m, &huge_boot_pages[nid], list) {
		if (!vmemmap_should_optimize_bootmem_page(m))
			continue;

		nr_pages = pages_per_huge_page(m->hstate);
		psize = nr_pages << PAGE_SHIFT;
		paddr = virt_to_phys(m);
		pfn = PHYS_PFN(paddr);
		map = pfn_to_page(pfn);
		start = (unsigned long)map;
		end = start + nr_pages * sizeof(struct page);

		if (vmemmap_populate_hvo(start, end, nid,
					 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
			continue;

		memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);

		pnum = pfn_to_section_nr(pfn);
		ns = psize / section_size;

		for (i = 0; i < ns; i++) {
			sparse_init_early_section(nid, map, pnum,
						  SECTION_IS_VMEMMAP_PREINIT);
			map += section_map_size();
			pnum++;
		}

		m->flags |= HUGE_BOOTMEM_HVO;
	}
}
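
/*
 * Validate the pre-HVO bootmem pages now that the zone layout is known. A
 * bootmem page that turns out to span more than one zone cannot be used as
 * a HugeTLB page: undo HVO for it, adjust the memmap page accounting, and
 * return its memory to memblock.
 */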
void __init hugetlb_vmemmap_init_late(int nid)
{
	struct huge_bootmem_page *m, *tm;
	unsigned long phys, nr_pages, start, end;
	unsigned long pfn, nr_mmap;
	struct hstate *h;
	void *map;

	if (!hugetlb_bootmem_allocated())
		return;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
		if (!(m->flags & HUGE_BOOTMEM_HVO))
			continue;

		phys = virt_to_phys(m);
		h = m->hstate;
		pfn = PHYS_PFN(phys);
		nr_pages = pages_per_huge_page(h);

		if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
			/*
			 * Oops, the hugetlb page spans multiple zones.
			 * Remove it from the list, and undo HVO.
			 */
			list_del(&m->list);

			map = pfn_to_page(pfn);

			start = (unsigned long)map;
			end = start + nr_pages * sizeof(struct page);

			vmemmap_undo_hvo(start, end, nid,
					 HUGETLB_VMEMMAP_RESERVE_SIZE);
			nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
			memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));

			memblock_phys_free(phys, huge_page_size(h));
			continue;
		} else
			m->flags |= HUGE_BOOTMEM_ZONES_VALID;
	}
}
#endif

static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);