1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * HugeTLB Vmemmap Optimization (HVO) 4 * 5 * Copyright (c) 2020, ByteDance. All rights reserved. 6 * 7 * Author: Muchun Song <songmuchun@bytedance.com> 8 * 9 * See Documentation/mm/vmemmap_dedup.rst 10 */ 11 #define pr_fmt(fmt) "HugeTLB: " fmt 12 13 #include <linux/pgtable.h> 14 #include <linux/moduleparam.h> 15 #include <linux/bootmem_info.h> 16 #include <linux/mmdebug.h> 17 #include <linux/pagewalk.h> 18 #include <asm/pgalloc.h> 19 #include <asm/tlbflush.h> 20 #include "hugetlb_vmemmap.h" 21 22 /** 23 * struct vmemmap_remap_walk - walk vmemmap page table 24 * 25 * @remap_pte: called for each lowest-level entry (PTE). 26 * @nr_walked: the number of walked pte. 27 * @reuse_page: the page which is reused for the tail vmemmap pages. 28 * @reuse_addr: the virtual address of the @reuse_page page. 29 * @vmemmap_pages: the list head of the vmemmap pages that can be freed 30 * or is mapped from. 31 * @flags: used to modify behavior in vmemmap page table walking 32 * operations. 33 */ 34 struct vmemmap_remap_walk { 35 void (*remap_pte)(pte_t *pte, unsigned long addr, 36 struct vmemmap_remap_walk *walk); 37 unsigned long nr_walked; 38 struct page *reuse_page; 39 unsigned long reuse_addr; 40 struct list_head *vmemmap_pages; 41 42 /* Skip the TLB flush when we split the PMD */ 43 #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) 44 /* Skip the TLB flush when we remap the PTE */ 45 #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) 46 /* synchronize_rcu() to avoid writes from page_ref_add_unless() */ 47 #define VMEMMAP_SYNCHRONIZE_RCU BIT(2) 48 unsigned long flags; 49 }; 50 51 static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, 52 struct vmemmap_remap_walk *walk) 53 { 54 pmd_t __pmd; 55 int i; 56 unsigned long addr = start; 57 pte_t *pgtable; 58 59 pgtable = pte_alloc_one_kernel(&init_mm); 60 if (!pgtable) 61 return -ENOMEM; 62 63 pmd_populate_kernel(&init_mm, &__pmd, pgtable); 64 65 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { 66 pte_t entry, *pte; 67 pgprot_t pgprot = PAGE_KERNEL; 68 69 entry = mk_pte(head + i, pgprot); 70 pte = pte_offset_kernel(&__pmd, addr); 71 set_pte_at(&init_mm, addr, pte, entry); 72 } 73 74 spin_lock(&init_mm.page_table_lock); 75 if (likely(pmd_leaf(*pmd))) { 76 /* 77 * Higher order allocations from buddy allocator must be able to 78 * be treated as indepdenent small pages (as they can be freed 79 * individually). 80 */ 81 if (!PageReserved(head)) 82 split_page(head, get_order(PMD_SIZE)); 83 84 /* Make pte visible before pmd. See comment in pmd_install(). */ 85 smp_wmb(); 86 pmd_populate_kernel(&init_mm, pmd, pgtable); 87 if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) 88 flush_tlb_kernel_range(start, start + PMD_SIZE); 89 } else { 90 pte_free_kernel(&init_mm, pgtable); 91 } 92 spin_unlock(&init_mm.page_table_lock); 93 94 return 0; 95 } 96 97 static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, 98 unsigned long next, struct mm_walk *walk) 99 { 100 int ret = 0; 101 struct page *head; 102 struct vmemmap_remap_walk *vmemmap_walk = walk->private; 103 104 /* Only splitting, not remapping the vmemmap pages. */ 105 if (!vmemmap_walk->remap_pte) 106 walk->action = ACTION_CONTINUE; 107 108 spin_lock(&init_mm.page_table_lock); 109 head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; 110 /* 111 * Due to HugeTLB alignment requirements and the vmemmap 112 * pages being at the start of the hotplugged memory 113 * region in memory_hotplug.memmap_on_memory case. Checking 114 * the vmemmap page associated with the first vmemmap page 115 * if it is self-hosted is sufficient. 116 * 117 * [ hotplugged memory ] 118 * [ section ][...][ section ] 119 * [ vmemmap ][ usable memory ] 120 * ^ | ^ | 121 * +--+ | | 122 * +------------------------+ 123 */ 124 if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { 125 struct page *page = head ? head + pte_index(addr) : 126 pte_page(ptep_get(pte_offset_kernel(pmd, addr))); 127 128 if (PageVmemmapSelfHosted(page)) 129 ret = -ENOTSUPP; 130 } 131 spin_unlock(&init_mm.page_table_lock); 132 if (!head || ret) 133 return ret; 134 135 return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); 136 } 137 138 static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, 139 unsigned long next, struct mm_walk *walk) 140 { 141 struct vmemmap_remap_walk *vmemmap_walk = walk->private; 142 143 /* 144 * The reuse_page is found 'first' in page table walking before 145 * starting remapping. 146 */ 147 if (!vmemmap_walk->reuse_page) 148 vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); 149 else 150 vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); 151 vmemmap_walk->nr_walked++; 152 153 return 0; 154 } 155 156 static const struct mm_walk_ops vmemmap_remap_ops = { 157 .pmd_entry = vmemmap_pmd_entry, 158 .pte_entry = vmemmap_pte_entry, 159 }; 160 161 static int vmemmap_remap_range(unsigned long start, unsigned long end, 162 struct vmemmap_remap_walk *walk) 163 { 164 int ret; 165 166 VM_BUG_ON(!PAGE_ALIGNED(start | end)); 167 168 mmap_read_lock(&init_mm); 169 ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops, 170 NULL, walk); 171 mmap_read_unlock(&init_mm); 172 if (ret) 173 return ret; 174 175 if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) 176 flush_tlb_kernel_range(start, end); 177 178 return 0; 179 } 180 181 /* 182 * Free a vmemmap page. A vmemmap page can be allocated from the memblock 183 * allocator or buddy allocator. If the PG_reserved flag is set, it means 184 * that it allocated from the memblock allocator, just free it via the 185 * free_bootmem_page(). Otherwise, use __free_page(). 186 */ 187 static inline void free_vmemmap_page(struct page *page) 188 { 189 if (PageReserved(page)) { 190 memmap_boot_pages_add(-1); 191 free_bootmem_page(page); 192 } else { 193 memmap_pages_add(-1); 194 __free_page(page); 195 } 196 } 197 198 /* Free a list of the vmemmap pages */ 199 static void free_vmemmap_page_list(struct list_head *list) 200 { 201 struct page *page, *next; 202 203 list_for_each_entry_safe(page, next, list, lru) 204 free_vmemmap_page(page); 205 } 206 207 static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, 208 struct vmemmap_remap_walk *walk) 209 { 210 /* 211 * Remap the tail pages as read-only to catch illegal write operation 212 * to the tail pages. 213 */ 214 pgprot_t pgprot = PAGE_KERNEL_RO; 215 struct page *page = pte_page(ptep_get(pte)); 216 pte_t entry; 217 218 /* Remapping the head page requires r/w */ 219 if (unlikely(addr == walk->reuse_addr)) { 220 pgprot = PAGE_KERNEL; 221 list_del(&walk->reuse_page->lru); 222 223 /* 224 * Makes sure that preceding stores to the page contents from 225 * vmemmap_remap_free() become visible before the set_pte_at() 226 * write. 227 */ 228 smp_wmb(); 229 } 230 231 entry = mk_pte(walk->reuse_page, pgprot); 232 list_add(&page->lru, walk->vmemmap_pages); 233 set_pte_at(&init_mm, addr, pte, entry); 234 } 235 236 /* 237 * How many struct page structs need to be reset. When we reuse the head 238 * struct page, the special metadata (e.g. page->flags or page->mapping) 239 * cannot copy to the tail struct page structs. The invalid value will be 240 * checked in the free_tail_page_prepare(). In order to avoid the message 241 * of "corrupted mapping in tail page". We need to reset at least 4 (one 242 * head struct page struct and three tail struct page structs) struct page 243 * structs. 244 */ 245 #define NR_RESET_STRUCT_PAGE 4 246 247 static inline void reset_struct_pages(struct page *start) 248 { 249 struct page *from = start + NR_RESET_STRUCT_PAGE; 250 251 BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); 252 memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); 253 } 254 255 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, 256 struct vmemmap_remap_walk *walk) 257 { 258 pgprot_t pgprot = PAGE_KERNEL; 259 struct page *page; 260 void *to; 261 262 BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); 263 264 page = list_first_entry(walk->vmemmap_pages, struct page, lru); 265 list_del(&page->lru); 266 to = page_to_virt(page); 267 copy_page(to, (void *)walk->reuse_addr); 268 reset_struct_pages(to); 269 270 /* 271 * Makes sure that preceding stores to the page contents become visible 272 * before the set_pte_at() write. 273 */ 274 smp_wmb(); 275 set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); 276 } 277 278 /** 279 * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) 280 * backing PMDs of the directmap into PTEs 281 * @start: start address of the vmemmap virtual address range that we want 282 * to remap. 283 * @end: end address of the vmemmap virtual address range that we want to 284 * remap. 285 * @reuse: reuse address. 286 * 287 * Return: %0 on success, negative error code otherwise. 288 */ 289 static int vmemmap_remap_split(unsigned long start, unsigned long end, 290 unsigned long reuse) 291 { 292 struct vmemmap_remap_walk walk = { 293 .remap_pte = NULL, 294 .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, 295 }; 296 297 /* See the comment in the vmemmap_remap_free(). */ 298 BUG_ON(start - reuse != PAGE_SIZE); 299 300 return vmemmap_remap_range(reuse, end, &walk); 301 } 302 303 /** 304 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) 305 * to the page which @reuse is mapped to, then free vmemmap 306 * which the range are mapped to. 307 * @start: start address of the vmemmap virtual address range that we want 308 * to remap. 309 * @end: end address of the vmemmap virtual address range that we want to 310 * remap. 311 * @reuse: reuse address. 312 * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers 313 * responsibility to free pages. 314 * @flags: modifications to vmemmap_remap_walk flags 315 * 316 * Return: %0 on success, negative error code otherwise. 317 */ 318 static int vmemmap_remap_free(unsigned long start, unsigned long end, 319 unsigned long reuse, 320 struct list_head *vmemmap_pages, 321 unsigned long flags) 322 { 323 int ret; 324 struct vmemmap_remap_walk walk = { 325 .remap_pte = vmemmap_remap_pte, 326 .reuse_addr = reuse, 327 .vmemmap_pages = vmemmap_pages, 328 .flags = flags, 329 }; 330 int nid = page_to_nid((struct page *)reuse); 331 gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; 332 333 /* 334 * Allocate a new head vmemmap page to avoid breaking a contiguous 335 * block of struct page memory when freeing it back to page allocator 336 * in free_vmemmap_page_list(). This will allow the likely contiguous 337 * struct page backing memory to be kept contiguous and allowing for 338 * more allocations of hugepages. Fallback to the currently 339 * mapped head page in case should it fail to allocate. 340 */ 341 walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); 342 if (walk.reuse_page) { 343 copy_page(page_to_virt(walk.reuse_page), 344 (void *)walk.reuse_addr); 345 list_add(&walk.reuse_page->lru, vmemmap_pages); 346 memmap_pages_add(1); 347 } 348 349 /* 350 * In order to make remapping routine most efficient for the huge pages, 351 * the routine of vmemmap page table walking has the following rules 352 * (see more details from the vmemmap_pte_range()): 353 * 354 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) 355 * should be continuous. 356 * - The @reuse address is part of the range [@reuse, @end) that we are 357 * walking which is passed to vmemmap_remap_range(). 358 * - The @reuse address is the first in the complete range. 359 * 360 * So we need to make sure that @start and @reuse meet the above rules. 361 */ 362 BUG_ON(start - reuse != PAGE_SIZE); 363 364 ret = vmemmap_remap_range(reuse, end, &walk); 365 if (ret && walk.nr_walked) { 366 end = reuse + walk.nr_walked * PAGE_SIZE; 367 /* 368 * vmemmap_pages contains pages from the previous 369 * vmemmap_remap_range call which failed. These 370 * are pages which were removed from the vmemmap. 371 * They will be restored in the following call. 372 */ 373 walk = (struct vmemmap_remap_walk) { 374 .remap_pte = vmemmap_restore_pte, 375 .reuse_addr = reuse, 376 .vmemmap_pages = vmemmap_pages, 377 .flags = 0, 378 }; 379 380 vmemmap_remap_range(reuse, end, &walk); 381 } 382 383 return ret; 384 } 385 386 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, 387 struct list_head *list) 388 { 389 gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; 390 unsigned long nr_pages = (end - start) >> PAGE_SHIFT; 391 int nid = page_to_nid((struct page *)start); 392 struct page *page, *next; 393 int i; 394 395 for (i = 0; i < nr_pages; i++) { 396 page = alloc_pages_node(nid, gfp_mask, 0); 397 if (!page) 398 goto out; 399 list_add(&page->lru, list); 400 } 401 memmap_pages_add(nr_pages); 402 403 return 0; 404 out: 405 list_for_each_entry_safe(page, next, list, lru) 406 __free_page(page); 407 return -ENOMEM; 408 } 409 410 /** 411 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) 412 * to the page which is from the @vmemmap_pages 413 * respectively. 414 * @start: start address of the vmemmap virtual address range that we want 415 * to remap. 416 * @end: end address of the vmemmap virtual address range that we want to 417 * remap. 418 * @reuse: reuse address. 419 * @flags: modifications to vmemmap_remap_walk flags 420 * 421 * Return: %0 on success, negative error code otherwise. 422 */ 423 static int vmemmap_remap_alloc(unsigned long start, unsigned long end, 424 unsigned long reuse, unsigned long flags) 425 { 426 LIST_HEAD(vmemmap_pages); 427 struct vmemmap_remap_walk walk = { 428 .remap_pte = vmemmap_restore_pte, 429 .reuse_addr = reuse, 430 .vmemmap_pages = &vmemmap_pages, 431 .flags = flags, 432 }; 433 434 /* See the comment in the vmemmap_remap_free(). */ 435 BUG_ON(start - reuse != PAGE_SIZE); 436 437 if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) 438 return -ENOMEM; 439 440 return vmemmap_remap_range(reuse, end, &walk); 441 } 442 443 DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); 444 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); 445 446 static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); 447 static int __init hugetlb_vmemmap_optimize_param(char *buf) 448 { 449 return kstrtobool(buf, &vmemmap_optimize_enabled); 450 } 451 early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param); 452 453 static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, 454 struct folio *folio, unsigned long flags) 455 { 456 int ret; 457 unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; 458 unsigned long vmemmap_reuse; 459 460 VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); 461 VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); 462 463 if (!folio_test_hugetlb_vmemmap_optimized(folio)) 464 return 0; 465 466 if (flags & VMEMMAP_SYNCHRONIZE_RCU) 467 synchronize_rcu(); 468 469 vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); 470 vmemmap_reuse = vmemmap_start; 471 vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; 472 473 /* 474 * The pages which the vmemmap virtual address range [@vmemmap_start, 475 * @vmemmap_end) are mapped to are freed to the buddy allocator, and 476 * the range is mapped to the page which @vmemmap_reuse is mapped to. 477 * When a HugeTLB page is freed to the buddy allocator, previously 478 * discarded vmemmap pages must be allocated and remapping. 479 */ 480 ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); 481 if (!ret) { 482 folio_clear_hugetlb_vmemmap_optimized(folio); 483 static_branch_dec(&hugetlb_optimize_vmemmap_key); 484 } 485 486 return ret; 487 } 488 489 /** 490 * hugetlb_vmemmap_restore_folio - restore previously optimized (by 491 * hugetlb_vmemmap_optimize_folio()) vmemmap pages which 492 * will be reallocated and remapped. 493 * @h: struct hstate. 494 * @folio: the folio whose vmemmap pages will be restored. 495 * 496 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, 497 * negative error code otherwise. 498 */ 499 int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) 500 { 501 return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); 502 } 503 504 /** 505 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. 506 * @h: hstate. 507 * @folio_list: list of folios. 508 * @non_hvo_folios: Output list of folios for which vmemmap exists. 509 * 510 * Return: number of folios for which vmemmap was restored, or an error code 511 * if an error was encountered restoring vmemmap for a folio. 512 * Folios that have vmemmap are moved to the non_hvo_folios 513 * list. Processing of entries stops when the first error is 514 * encountered. The folio that experienced the error and all 515 * non-processed folios will remain on folio_list. 516 */ 517 long hugetlb_vmemmap_restore_folios(const struct hstate *h, 518 struct list_head *folio_list, 519 struct list_head *non_hvo_folios) 520 { 521 struct folio *folio, *t_folio; 522 long restored = 0; 523 long ret = 0; 524 unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; 525 526 list_for_each_entry_safe(folio, t_folio, folio_list, lru) { 527 if (folio_test_hugetlb_vmemmap_optimized(folio)) { 528 ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); 529 /* only need to synchronize_rcu() once for each batch */ 530 flags &= ~VMEMMAP_SYNCHRONIZE_RCU; 531 532 if (ret) 533 break; 534 restored++; 535 } 536 537 /* Add non-optimized folios to output list */ 538 list_move(&folio->lru, non_hvo_folios); 539 } 540 541 if (restored) 542 flush_tlb_all(); 543 if (!ret) 544 ret = restored; 545 return ret; 546 } 547 548 /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ 549 static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) 550 { 551 if (folio_test_hugetlb_vmemmap_optimized(folio)) 552 return false; 553 554 if (!READ_ONCE(vmemmap_optimize_enabled)) 555 return false; 556 557 if (!hugetlb_vmemmap_optimizable(h)) 558 return false; 559 560 return true; 561 } 562 563 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, 564 struct folio *folio, 565 struct list_head *vmemmap_pages, 566 unsigned long flags) 567 { 568 int ret = 0; 569 unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; 570 unsigned long vmemmap_reuse; 571 572 VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); 573 VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); 574 575 if (!vmemmap_should_optimize_folio(h, folio)) 576 return ret; 577 578 static_branch_inc(&hugetlb_optimize_vmemmap_key); 579 580 if (flags & VMEMMAP_SYNCHRONIZE_RCU) 581 synchronize_rcu(); 582 /* 583 * Very Subtle 584 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed 585 * immediately after remapping. As a result, subsequent accesses 586 * and modifications to struct pages associated with the hugetlb 587 * page could be to the OLD struct pages. Set the vmemmap optimized 588 * flag here so that it is copied to the new head page. This keeps 589 * the old and new struct pages in sync. 590 * If there is an error during optimization, we will immediately FLUSH 591 * the TLB and clear the flag below. 592 */ 593 folio_set_hugetlb_vmemmap_optimized(folio); 594 595 vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); 596 vmemmap_reuse = vmemmap_start; 597 vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; 598 599 /* 600 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) 601 * to the page which @vmemmap_reuse is mapped to. Add pages previously 602 * mapping the range to vmemmap_pages list so that they can be freed by 603 * the caller. 604 */ 605 ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, 606 vmemmap_pages, flags); 607 if (ret) { 608 static_branch_dec(&hugetlb_optimize_vmemmap_key); 609 folio_clear_hugetlb_vmemmap_optimized(folio); 610 } 611 612 return ret; 613 } 614 615 /** 616 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. 617 * @h: struct hstate. 618 * @folio: the folio whose vmemmap pages will be optimized. 619 * 620 * This function only tries to optimize @folio's vmemmap pages and does not 621 * guarantee that the optimization will succeed after it returns. The caller 622 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's 623 * vmemmap pages have been optimized. 624 */ 625 void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) 626 { 627 LIST_HEAD(vmemmap_pages); 628 629 __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); 630 free_vmemmap_page_list(&vmemmap_pages); 631 } 632 633 static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) 634 { 635 unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; 636 unsigned long vmemmap_reuse; 637 638 if (!vmemmap_should_optimize_folio(h, folio)) 639 return 0; 640 641 vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); 642 vmemmap_reuse = vmemmap_start; 643 vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; 644 645 /* 646 * Split PMDs on the vmemmap virtual address range [@vmemmap_start, 647 * @vmemmap_end] 648 */ 649 return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); 650 } 651 652 static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, 653 struct list_head *folio_list, 654 bool boot) 655 { 656 struct folio *folio; 657 int nr_to_optimize; 658 LIST_HEAD(vmemmap_pages); 659 unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; 660 661 nr_to_optimize = 0; 662 list_for_each_entry(folio, folio_list, lru) { 663 int ret; 664 unsigned long spfn, epfn; 665 666 if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) { 667 /* 668 * Already optimized by pre-HVO, just map the 669 * mirrored tail page structs RO. 670 */ 671 spfn = (unsigned long)&folio->page; 672 epfn = spfn + pages_per_huge_page(h); 673 vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio), 674 HUGETLB_VMEMMAP_RESERVE_SIZE); 675 register_page_bootmem_memmap(pfn_to_section_nr(spfn), 676 &folio->page, 677 HUGETLB_VMEMMAP_RESERVE_SIZE); 678 static_branch_inc(&hugetlb_optimize_vmemmap_key); 679 continue; 680 } 681 682 nr_to_optimize++; 683 684 ret = hugetlb_vmemmap_split_folio(h, folio); 685 686 /* 687 * Spliting the PMD requires allocating a page, thus lets fail 688 * early once we encounter the first OOM. No point in retrying 689 * as it can be dynamically done on remap with the memory 690 * we get back from the vmemmap deduplication. 691 */ 692 if (ret == -ENOMEM) 693 break; 694 } 695 696 if (!nr_to_optimize) 697 /* 698 * All pre-HVO folios, nothing left to do. It's ok if 699 * there is a mix of pre-HVO and not yet HVO-ed folios 700 * here, as __hugetlb_vmemmap_optimize_folio() will 701 * skip any folios that already have the optimized flag 702 * set, see vmemmap_should_optimize_folio(). 703 */ 704 goto out; 705 706 flush_tlb_all(); 707 708 list_for_each_entry(folio, folio_list, lru) { 709 int ret; 710 711 ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); 712 /* only need to synchronize_rcu() once for each batch */ 713 flags &= ~VMEMMAP_SYNCHRONIZE_RCU; 714 715 /* 716 * Pages to be freed may have been accumulated. If we 717 * encounter an ENOMEM, free what we have and try again. 718 * This can occur in the case that both spliting fails 719 * halfway and head page allocation also failed. In this 720 * case __hugetlb_vmemmap_optimize_folio() would free memory 721 * allowing more vmemmap remaps to occur. 722 */ 723 if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { 724 flush_tlb_all(); 725 free_vmemmap_page_list(&vmemmap_pages); 726 INIT_LIST_HEAD(&vmemmap_pages); 727 __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); 728 } 729 } 730 731 out: 732 flush_tlb_all(); 733 free_vmemmap_page_list(&vmemmap_pages); 734 } 735 736 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) 737 { 738 __hugetlb_vmemmap_optimize_folios(h, folio_list, false); 739 } 740 741 void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list) 742 { 743 __hugetlb_vmemmap_optimize_folios(h, folio_list, true); 744 } 745 746 #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT 747 748 /* Return true of a bootmem allocated HugeTLB page should be pre-HVO-ed */ 749 static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m) 750 { 751 unsigned long section_size, psize, pmd_vmemmap_size; 752 phys_addr_t paddr; 753 754 if (!READ_ONCE(vmemmap_optimize_enabled)) 755 return false; 756 757 if (!hugetlb_vmemmap_optimizable(m->hstate)) 758 return false; 759 760 psize = huge_page_size(m->hstate); 761 paddr = virt_to_phys(m); 762 763 /* 764 * Pre-HVO only works if the bootmem huge page 765 * is aligned to the section size. 766 */ 767 section_size = (1UL << PA_SECTION_SHIFT); 768 if (!IS_ALIGNED(paddr, section_size) || 769 !IS_ALIGNED(psize, section_size)) 770 return false; 771 772 /* 773 * The pre-HVO code does not deal with splitting PMDS, 774 * so the bootmem page must be aligned to the number 775 * of base pages that can be mapped with one vmemmap PMD. 776 */ 777 pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT; 778 if (!IS_ALIGNED(paddr, pmd_vmemmap_size) || 779 !IS_ALIGNED(psize, pmd_vmemmap_size)) 780 return false; 781 782 return true; 783 } 784 785 /* 786 * Initialize memmap section for a gigantic page, HVO-style. 787 */ 788 void __init hugetlb_vmemmap_init_early(int nid) 789 { 790 unsigned long psize, paddr, section_size; 791 unsigned long ns, i, pnum, pfn, nr_pages; 792 unsigned long start, end; 793 struct huge_bootmem_page *m = NULL; 794 void *map; 795 796 /* 797 * Noting to do if bootmem pages were not allocated 798 * early in boot, or if HVO wasn't enabled in the 799 * first place. 800 */ 801 if (!hugetlb_bootmem_allocated()) 802 return; 803 804 if (!READ_ONCE(vmemmap_optimize_enabled)) 805 return; 806 807 section_size = (1UL << PA_SECTION_SHIFT); 808 809 list_for_each_entry(m, &huge_boot_pages[nid], list) { 810 if (!vmemmap_should_optimize_bootmem_page(m)) 811 continue; 812 813 nr_pages = pages_per_huge_page(m->hstate); 814 psize = nr_pages << PAGE_SHIFT; 815 paddr = virt_to_phys(m); 816 pfn = PHYS_PFN(paddr); 817 map = pfn_to_page(pfn); 818 start = (unsigned long)map; 819 end = start + nr_pages * sizeof(struct page); 820 821 if (vmemmap_populate_hvo(start, end, nid, 822 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) 823 continue; 824 825 memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); 826 827 pnum = pfn_to_section_nr(pfn); 828 ns = psize / section_size; 829 830 for (i = 0; i < ns; i++) { 831 sparse_init_early_section(nid, map, pnum, 832 SECTION_IS_VMEMMAP_PREINIT); 833 map += section_map_size(); 834 pnum++; 835 } 836 837 m->flags |= HUGE_BOOTMEM_HVO; 838 } 839 } 840 841 void __init hugetlb_vmemmap_init_late(int nid) 842 { 843 struct huge_bootmem_page *m, *tm; 844 unsigned long phys, nr_pages, start, end; 845 unsigned long pfn, nr_mmap; 846 struct hstate *h; 847 void *map; 848 849 if (!hugetlb_bootmem_allocated()) 850 return; 851 852 if (!READ_ONCE(vmemmap_optimize_enabled)) 853 return; 854 855 list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { 856 if (!(m->flags & HUGE_BOOTMEM_HVO)) 857 continue; 858 859 phys = virt_to_phys(m); 860 h = m->hstate; 861 pfn = PHYS_PFN(phys); 862 nr_pages = pages_per_huge_page(h); 863 864 if (!hugetlb_bootmem_page_zones_valid(nid, m)) { 865 /* 866 * Oops, the hugetlb page spans multiple zones. 867 * Remove it from the list, and undo HVO. 868 */ 869 list_del(&m->list); 870 871 map = pfn_to_page(pfn); 872 873 start = (unsigned long)map; 874 end = start + nr_pages * sizeof(struct page); 875 876 vmemmap_undo_hvo(start, end, nid, 877 HUGETLB_VMEMMAP_RESERVE_SIZE); 878 nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; 879 memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); 880 881 memblock_phys_free(phys, huge_page_size(h)); 882 continue; 883 } else 884 m->flags |= HUGE_BOOTMEM_ZONES_VALID; 885 } 886 } 887 #endif 888 889 static const struct ctl_table hugetlb_vmemmap_sysctls[] = { 890 { 891 .procname = "hugetlb_optimize_vmemmap", 892 .data = &vmemmap_optimize_enabled, 893 .maxlen = sizeof(vmemmap_optimize_enabled), 894 .mode = 0644, 895 .proc_handler = proc_dobool, 896 }, 897 }; 898 899 static int __init hugetlb_vmemmap_init(void) 900 { 901 const struct hstate *h; 902 903 /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ 904 BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); 905 906 for_each_hstate(h) { 907 if (hugetlb_vmemmap_optimizable(h)) { 908 register_sysctl_init("vm", hugetlb_vmemmap_sysctls); 909 break; 910 } 911 } 912 return 0; 913 } 914 late_initcall(hugetlb_vmemmap_init); 915