// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt) "HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked so far.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages: pages to be freed
 *			on the remap-free path, or pages to map from on the
 *			restore path.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
	unsigned long flags;
};

static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}
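/*
 * pmd_entry callback of the vmemmap page table walk. If the PMD is a leaf,
 * split it into a PTE-mapped page table via vmemmap_split_pmd() so that the
 * individual vmemmap pages can be remapped later. Self-hosted vmemmap pages
 * (memory_hotplug.memmap_on_memory) are refused with -ENOTSUPP, as they
 * cannot be optimized.
 */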
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
	 *
	 *	[          hotplugged memory          ]
	 *	[  section  ][...][  section  ]
	 *	[ vmemmap ][       usable memory       ]
	 *	  ^     |    ^                        |
	 *	  +-----+    |                        |
	 *	             +------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walk, before
	 * remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry = vmemmap_pmd_entry,
	.pte_entry = vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it was
 * allocated from the memblock allocator; free it via free_bootmem_page().
 * Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		memmap_boot_pages_add(-1);
		free_bootmem_page(page);
	} else {
		memmap_pages_add(-1);
		__free_page(page);
	}
}

/* Free a list of vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}
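/*
 * End state after a successful remap of one HugeTLB folio (sizes are
 * architecture dependent; the example assumes 4 KiB base pages and a 64-byte
 * struct page, so a 2 MiB folio is described by 512 struct pages, i.e. 8
 * vmemmap pages):
 *
 *   vmemmap page 0     -> reuse page (writable)
 *   vmemmap pages 1..7 -> reuse page (read-only aliases)
 *
 * The pages that previously backed the range end up on walk->vmemmap_pages
 * and are freed by the caller, saving about 7 pages per 2 MiB folio (or
 * about 4095 pages per 1 GiB folio).
 */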
/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, its special metadata (e.g. page->flags or page->mapping)
 * must not be copied to the tail struct page structs. The invalid values
 * would be caught by free_tail_page_prepare() and reported as "corrupted
 * mapping in tail page". To avoid that, we need to reset at least 3 struct
 * page structs (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *			 range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte = NULL,
		.flags = VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}
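/*
 * Calling convention shared by vmemmap_remap_split(), vmemmap_remap_free()
 * and vmemmap_remap_alloc(): @reuse is the head vmemmap page address and
 * @start is the first tail vmemmap page address, i.e. @start must equal
 * @reuse + PAGE_SIZE, while the page table walk itself covers [@reuse, @end).
 */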
/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed. It is the
 *			caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte = vmemmap_remap_pte,
		.reuse_addr = reuse,
		.vmemmap_pages = vmemmap_pages,
		.flags = flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous and thereby
	 * allows for more hugepage allocations. Fall back to the currently
	 * mapped head page if the allocation fails.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		memmap_pages_add(1);
	}

	/*
	 * To make the remapping routine most efficient for the huge pages,
	 * the vmemmap page table walking routine has the following rules
	 * (see more details in vmemmap_pte_entry()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be continuous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte = vmemmap_restore_pte,
			.reuse_addr = reuse,
			.vmemmap_pages = vmemmap_pages,
			.flags = 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}
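/*
 * Allocate one page for every page in the vmemmap range [@start, @end), on
 * the node that currently hosts the vmemmap, and queue them on @list. On
 * allocation failure, every page allocated so far is freed again and
 * -ENOMEM is returned.
 */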
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}
	memmap_pages_add(nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 so that each page in the range is backed by its own
 *			 freshly allocated vmemmap page.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte = vmemmap_restore_pte,
		.reuse_addr = reuse,
		.vmemmap_pages = &vmemmap_pages,
		.flags = flags,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) was mapped to were freed to the buddy allocator, and
	 * the range was remapped to the page which @vmemmap_reuse is mapped
	 * to. When a HugeTLB page is freed to the buddy allocator, the
	 * previously discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap
 *				   pages which will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}
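/*
 * Batched restore: every folio is restored with VMEMMAP_REMAP_NO_TLB_FLUSH
 * and the TLB is flushed once at the end if anything was restored, while
 * VMEMMAP_SYNCHRONIZE_RCU is cleared after the first folio so that
 * synchronize_rcu() runs only once per batch.
 */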
/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
			/* only need to synchronize_rcu() once for each batch */
			flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true if the vmemmap of this HugeTLB folio should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add the pages that
	 * previously mapped the range to the vmemmap_pages list so that they
	 * can be freed by the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}
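/*
 * Optimization entry points: hugetlb_vmemmap_optimize_folio() handles a
 * single freshly prepared HugeTLB folio, while hugetlb_vmemmap_optimize_folios()
 * below batches the PMD splits, the RCU synchronization and the TLB flushes
 * for a whole list of folios.
 */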
/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
	free_vmemmap_page_list(&vmemmap_pages);
}

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range
	 * [@vmemmap_start, @vmemmap_end)
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so fail
		 * early once we encounter the first OOM. There is no point
		 * in retrying, as the split can be done dynamically on remap
		 * with the memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		/* only need to synchronize_rcu() once for each batch */
		flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

		/*
		 * Pages to be freed may have accumulated. If we encounter an
		 * ENOMEM, free what we have and try again. This can occur
		 * when splitting failed halfway and head page allocation also
		 * failed; freeing the accumulated vmemmap pages gives the
		 * retry memory to work with, allowing more vmemmap remaps to
		 * occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname = "hugetlb_optimize_vmemmap",
		.data = &vmemmap_optimize_enabled,
		.maxlen = sizeof(vmemmap_optimize_enabled),
		.mode = 0644,
		.proc_handler = proc_dobool,
	},
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);