// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked so far.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			(remap path) or that are mapped from (restore path).
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long flags;
};

static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

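/*
 * Illustrative geometry (assuming 4K base pages and a 64-byte struct page,
 * as on a typical x86_64 configuration): one vmemmap PMD maps 2MB of struct
 * pages, i.e. the metadata for 32768 base pages. A 2MB HugeTLB page needs
 * 512 struct pages (8 vmemmap pages), of which HVO keeps one and frees the
 * other 7; a 1GB HugeTLB page frees 4095 of its 4096 vmemmap pages. The PMD
 * split above is what makes those 4K vmemmap pages individually freeable
 * and remappable. See Documentation/mm/vmemmap_dedup.rst for the details.
 */
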
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
	 *
	 * [                hotplugged memory                 ]
	 * [       section       ][...][       section        ]
	 * [ vmemmap ][            usable memory              ]
	 *   ^    |  ^                                        |
	 *   +----+  |                                        |
	 *           +----------------------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walk, before
	 * remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator and must be freed via
 * free_bootmem_page(); otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		memmap_boot_pages_add(-1);
		free_bootmem_page(page);
	} else {
		memmap_pages_add(-1);
		__free_page(page);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

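/*
 * The remap above relies on the HVO invariant that all tail struct pages of
 * a HugeTLB page have identical contents: only the head and the first few
 * tail struct pages carry distinct state, and those live on the retained
 * head vmemmap page. Because of that, every vmemmap page past the first can
 * be mapped, read-only, to the single reuse page without changing what a
 * reader of the tail struct pages observes (see
 * Documentation/mm/vmemmap_dedup.rst).
 */
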
/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values are
 * checked by free_tail_page_prepare(); to avoid its "corrupted mapping in
 * tail page" report, we need to reset at least 3 struct page structs (one
 * head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                       range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

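/*
 * vmemmap_remap_split() above is only a preparatory pass, used by
 * hugetlb_vmemmap_optimize_folios(): it splits vmemmap PMDs (allocating the
 * needed PTE tables) without remapping anything, so that a whole batch of
 * folios can be split with a single deferred TLB flush and an -ENOMEM can
 * abort the batch before any remapping has taken place.
 */
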
/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous, allowing for
	 * more allocations of huge pages. Fall back to the currently mapped
	 * head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		memmap_pages_add(1);
	}

	/*
	 * In order to make the remapping routine most efficient for huge pages,
	 * the vmemmap page table walking routine obeys the following rules
	 * (see more details in vmemmap_pte_entry()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking, which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These are pages
		 * which were removed from the vmemmap. They will be restored
		 * in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}
	memmap_pages_add(nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

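/*
 * Note the two different allocation policies: alloc_vmemmap_page_list()
 * above uses __GFP_RETRY_MAYFAIL because a HugeTLB page cannot be returned
 * to the buddy allocator until its full vmemmap is populated again, so the
 * restore path tries hard without invoking the OOM killer. The optimize
 * path in vmemmap_remap_free() only wants an optional replacement head page
 * and is purely opportunistic, hence __GFP_NORETRY | __GFP_NOWARN there.
 */
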
/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       to freshly allocated vmemmap pages, one page for each
 *                       PTE in the range.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
	synchronize_rcu();

	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

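/*
 * The list variant below batches the expensive TLB maintenance: each folio
 * is restored with VMEMMAP_REMAP_NO_TLB_FLUSH and a single flush_tlb_all()
 * then covers the whole batch, instead of one flush_tlb_kernel_range() per
 * folio as in the single-folio helper above.
 */
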
/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;

	/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
	synchronize_rcu();

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio,
							      VMEMMAP_REMAP_NO_TLB_FLUSH);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true iff a HugeTLB folio's vmemmap can and should be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to the vmemmap_pages list so that they can be freed
	 * by the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	/* avoid writes from page_ref_add_unless() while folding vmemmap */
	synchronize_rcu();

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range
	 * [@vmemmap_start, @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so fail early
		 * once we encounter the first OOM. There is no point in
		 * retrying, as the split can be done dynamically on remap with
		 * the memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	/* avoid writes from page_ref_add_unless() while folding vmemmap */
	synchronize_rcu();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
						       VMEMMAP_REMAP_NO_TLB_FLUSH);

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur in the case that both splitting fails
		 * halfway and head page allocation also failed. In this
		 * case __hugetlb_vmemmap_optimize_folio() would free memory,
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
							 VMEMMAP_REMAP_NO_TLB_FLUSH);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);