// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked so far.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or that pages are remapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long		flags;
};

static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher-order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}
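/*
 * Page table walk callback for PMD entries: refuse to touch self-hosted
 * vmemmap pages (memory_hotplug.memmap_on_memory) and split any leaf PMD
 * via vmemmap_split_pmd() so the individual vmemmap pages underneath can be
 * remapped. When only splitting (walk->remap_pte == NULL), the PTE level
 * below this PMD is not walked.
 */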
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap
	 * pages being at the start of the hotplugged memory
	 * region in the memory_hotplug.memmap_on_memory case,
	 * checking whether the vmemmap page associated with the
	 * first vmemmap page is self-hosted is sufficient.
	 *
	 * [  hotplugged memory  ]
	 * [  section  ][...][  section  ]
	 * [ vmemmap ][     usable memory     ]
	 *   ^  |       ^                    |
	 *   +--+       |                    |
	 *              +--------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walk, before
	 * the remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator; free it via free_bootmem_page().
 * Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		free_bootmem_page(page);
		mod_node_page_state(page_pgdat(page), NR_MEMMAP_BOOT, -1);
	} else {
		__free_page(page);
		mod_node_page_state(page_pgdat(page), NR_MEMMAP, -1);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}
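/*
 * remap_pte callback used by vmemmap_remap_free(): point @pte at
 * walk->reuse_page (read-only for tail pages, read-write for the head page
 * at walk->reuse_addr) and queue the old page on walk->vmemmap_pages so the
 * caller can free it.
 */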
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by free_tail_page_prepare(). To avoid the "corrupted mapping in
 * tail page" message, we need to reset at least 3 struct page structs (one
 * head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}
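/*
 * remap_pte callback used by vmemmap_remap_alloc(): take the first page off
 * walk->vmemmap_pages, fill it with a copy of the shared reuse page, reset
 * the reused struct pages, and map it at @addr read-write.
 */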
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *			 range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous, allowing for
	 * more hugepage allocations. Fall back to the currently mapped head
	 * page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, 1);
	}

	/*
	 * To make the remapping routine most efficient for huge pages, the
	 * vmemmap page table walk obeys the following rules (see
	 * vmemmap_pte_entry() for more details):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}
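/*
 * Allocate, on the node of @start, the pages needed to back the vmemmap
 * virtual address range [@start, @end) and add them to @list. On failure,
 * free everything allocated so far and return -ENOMEM.
 */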
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page) {
			mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, i);
			goto out;
		}
		list_add(&page->lru, list);
	}

	mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap each page of the vmemmap virtual address range
 *			 [@start, @end) to a freshly allocated page.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
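/*
 * Reallocate and remap the vmemmap pages that were discarded when @folio was
 * optimized. Expects an unused (refcount == 0) hugetlb folio; a no-op if
 * @folio is not currently vmemmap-optimized. @flags is passed through to the
 * vmemmap_remap_walk (e.g. VMEMMAP_REMAP_NO_TLB_FLUSH).
 */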
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
	synchronize_rcu();

	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
					struct list_head *folio_list,
					struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;

	/* avoid writes from page_ref_add_unless() while unfolding vmemmap */
	synchronize_rcu();

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio,
							      VMEMMAP_REMAP_NO_TLB_FLUSH);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true iff a HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to the vmemmap_pages list so that they can be freed
	 * by the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	/* avoid writes from page_ref_add_unless() while folding vmemmap */
	synchronize_rcu();

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}
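/**
 * hugetlb_vmemmap_optimize_folios - optimize the vmemmap pages of every folio
 *				     on @folio_list.
 * @h:		struct hstate.
 * @folio_list:	list of folios whose vmemmap pages will be optimized.
 *
 * Like hugetlb_vmemmap_optimize_folio(), but batched: the PMDs are split for
 * all folios up front and the TLB flushes are deferred, so the whole list is
 * remapped with only a few global flushes.
 */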
618 */ 619 void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) 620 { 621 LIST_HEAD(vmemmap_pages); 622 623 /* avoid writes from page_ref_add_unless() while folding vmemmap */ 624 synchronize_rcu(); 625 626 __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); 627 free_vmemmap_page_list(&vmemmap_pages); 628 } 629 630 static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) 631 { 632 unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; 633 unsigned long vmemmap_reuse; 634 635 if (!vmemmap_should_optimize_folio(h, folio)) 636 return 0; 637 638 vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); 639 vmemmap_reuse = vmemmap_start; 640 vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; 641 642 /* 643 * Split PMDs on the vmemmap virtual address range [@vmemmap_start, 644 * @vmemmap_end] 645 */ 646 return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); 647 } 648 649 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) 650 { 651 struct folio *folio; 652 LIST_HEAD(vmemmap_pages); 653 654 list_for_each_entry(folio, folio_list, lru) { 655 int ret = hugetlb_vmemmap_split_folio(h, folio); 656 657 /* 658 * Spliting the PMD requires allocating a page, thus lets fail 659 * early once we encounter the first OOM. No point in retrying 660 * as it can be dynamically done on remap with the memory 661 * we get back from the vmemmap deduplication. 662 */ 663 if (ret == -ENOMEM) 664 break; 665 } 666 667 flush_tlb_all(); 668 669 /* avoid writes from page_ref_add_unless() while folding vmemmap */ 670 synchronize_rcu(); 671 672 list_for_each_entry(folio, folio_list, lru) { 673 int ret; 674 675 ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 676 VMEMMAP_REMAP_NO_TLB_FLUSH); 677 678 /* 679 * Pages to be freed may have been accumulated. If we 680 * encounter an ENOMEM, free what we have and try again. 681 * This can occur in the case that both spliting fails 682 * halfway and head page allocation also failed. In this 683 * case __hugetlb_vmemmap_optimize_folio() would free memory 684 * allowing more vmemmap remaps to occur. 685 */ 686 if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { 687 flush_tlb_all(); 688 free_vmemmap_page_list(&vmemmap_pages); 689 INIT_LIST_HEAD(&vmemmap_pages); 690 __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 691 VMEMMAP_REMAP_NO_TLB_FLUSH); 692 } 693 } 694 695 flush_tlb_all(); 696 free_vmemmap_page_list(&vmemmap_pages); 697 } 698 699 static struct ctl_table hugetlb_vmemmap_sysctls[] = { 700 { 701 .procname = "hugetlb_optimize_vmemmap", 702 .data = &vmemmap_optimize_enabled, 703 .maxlen = sizeof(vmemmap_optimize_enabled), 704 .mode = 0644, 705 .proc_handler = proc_dobool, 706 }, 707 }; 708 709 static int __init hugetlb_vmemmap_init(void) 710 { 711 const struct hstate *h; 712 713 /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ 714 BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); 715 716 for_each_hstate(h) { 717 if (hugetlb_vmemmap_optimizable(h)) { 718 register_sysctl_init("vm", hugetlb_vmemmap_sysctls); 719 break; 720 } 721 } 722 return 0; 723 } 724 late_initcall(hugetlb_vmemmap_init); 725