// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <linux/pgalloc.h>

#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or that are remapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
	unsigned long		flags;
};

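/*
 * Split a PMD-mapped vmemmap huge page into a PTE-level page table mapping
 * the same range with base pages, so that individual tail vmemmap pages can
 * later be remapped and freed.
 */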
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

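/*
 * PMD-level callback for the vmemmap walk: refuse to touch self-hosted
 * vmemmap (memmap_on_memory) ranges, and split a PMD-mapped vmemmap huge
 * page so that the PTE-level remapping can proceed.
 */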
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the struct
	 * page backing the first vmemmap page is self-hosted is sufficient.
	 *
	 * [  hotplugged memory  ]
	 * [        section      ][...][        section      ]
	 * [ vmemmap ][                usable memory          ]
	 *   ^   |    ^                                       |
	 *   +---+    |                                       |
	 *            +---------------------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

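/*
 * PTE-level callback for the vmemmap walk: record the first PTE's page as
 * the reuse page, then hand every following PTE to ->remap_pte().
 */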
static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in page table walking before
	 * starting remapping.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

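/*
 * Walk the kernel page tables covering the vmemmap range [@start, @end) with
 * the given walk descriptor, flushing the TLB afterwards when PTEs were
 * remapped and the caller did not ask to defer the flush.
 */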
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
					   NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator, so free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		memmap_boot_pages_add(-1);
		free_bootmem_page(page);
	} else {
		memmap_pages_add(-1);
		__free_page(page);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

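/*
 * ->remap_pte callback used when freeing vmemmap: point the PTE at the
 * shared reuse page (read-only for tail pages) and queue the old page on
 * the walk's vmemmap_pages list so the caller can free it.
 */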
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by free_tail_page_prepare(). In order to avoid a "corrupted
 * mapping in tail page" message, we need to reset at least 4 struct page
 * structs (one head struct page and three tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	4

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

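/*
 * ->remap_pte callback used when restoring vmemmap: take a fresh page from
 * the walk's vmemmap_pages list, copy the reuse page's contents into it,
 * reset the leading struct pages, and map it back at @addr read/write.
 */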
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *			 range [@start, @end) into PTE-level mappings.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed. It is the
 *			caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This will allow the likely
	 * contiguous struct page backing memory to be kept contiguous and
	 * allow for more allocations of hugepages. Fall back to the currently
	 * mapped head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		memmap_pages_add(1);
	}

	/*
	 * In order to make the remapping routine most efficient for huge pages,
	 * the vmemmap page table walking routine follows these rules
	 * (see vmemmap_pte_entry() for more details):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   must be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

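/*
 * Allocate one page per vmemmap page in [@start, @end) from the node backing
 * @start and add them to @list. On failure, free whatever was allocated and
 * return -ENOMEM.
 */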
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}
	memmap_pages_add(nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap each page of the vmemmap virtual address range
 *			 [@start, @end) to a freshly allocated vmemmap page.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
static int __init hugetlb_vmemmap_optimize_param(char *buf)
{
	return kstrtobool(buf, &vmemmap_optimize_enabled);
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);

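/*
 * Restore the vmemmap of a single HVO-optimized folio: reallocate the
 * discarded vmemmap pages and remap the folio's vmemmap range to them.
 */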
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages to which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) was mapped have been freed to the buddy allocator, and
	 * the range is now mapped to the page which @vmemmap_reuse is mapped
	 * to. When a HugeTLB page is freed to the buddy allocator, the
	 * previously discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages
 *				   which will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
			/* only need to synchronize_rcu() once for each batch */
			flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true iff a HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

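/*
 * Optimize a single folio: remap its tail vmemmap pages to the shared reuse
 * page and collect the now-unused vmemmap pages on @vmemmap_pages for the
 * caller to free.
 */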
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to vmemmap_pages list so that they can be freed by
	 * the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
	free_vmemmap_page_list(&vmemmap_pages);
}

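/*
 * Pre-split the PMDs covering @folio's vmemmap so that the later remapping
 * pass only has to operate on PTEs.
 */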
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

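/*
 * Optimize every folio on @folio_list in two passes: first split the vmemmap
 * PMDs (or, for pre-HVO boot folios, just write-protect the mirrored tail
 * struct pages), then remap and free the redundant vmemmap pages in a batch
 * with a single deferred TLB flush.
 */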
static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
					      struct list_head *folio_list,
					      bool boot)
{
	struct folio *folio;
	int nr_to_optimize;
	LIST_HEAD(vmemmap_pages);
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	nr_to_optimize = 0;
	list_for_each_entry(folio, folio_list, lru) {
		int ret;
		unsigned long spfn, epfn;

		if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
			/*
			 * Already optimized by pre-HVO, just map the
			 * mirrored tail page structs RO.
			 */
			spfn = (unsigned long)&folio->page;
			epfn = spfn + pages_per_huge_page(h);
			vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
					      HUGETLB_VMEMMAP_RESERVE_SIZE);
			register_page_bootmem_memmap(pfn_to_section_nr(spfn),
						     &folio->page,
						     HUGETLB_VMEMMAP_RESERVE_SIZE);
			static_branch_inc(&hugetlb_optimize_vmemmap_key);
			continue;
		}

		nr_to_optimize++;

		ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, thus let's fail
		 * early once we encounter the first OOM. There is no point in
		 * retrying, as it can be dynamically done on remap with the
		 * memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	if (!nr_to_optimize)
		/*
		 * All pre-HVO folios, nothing left to do. It's ok if
		 * there is a mix of pre-HVO and not yet HVO-ed folios
		 * here, as __hugetlb_vmemmap_optimize_folio() will
		 * skip any folios that already have the optimized flag
		 * set, see vmemmap_should_optimize_folio().
		 */
		goto out;

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		/* only need to synchronize_rcu() once for each batch */
		flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur when splitting fails halfway and head page
		 * allocation also fails. In that case
		 * __hugetlb_vmemmap_optimize_folio() would free memory,
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		}
	}

out:
	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

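/* Optimize the vmemmap of every folio on @folio_list. */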
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, false);
}

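/* As above, but for bootmem-allocated folios, which may already be pre-HVO-ed. */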
void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, true);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT

/* Return true if a bootmem allocated HugeTLB page should be pre-HVO-ed */
static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
{
	unsigned long section_size, psize, pmd_vmemmap_size;
	phys_addr_t paddr;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(m->hstate))
		return false;

	psize = huge_page_size(m->hstate);
	paddr = virt_to_phys(m);

	/*
	 * Pre-HVO only works if the bootmem huge page
	 * is aligned to the section size.
	 */
	section_size = (1UL << PA_SECTION_SHIFT);
	if (!IS_ALIGNED(paddr, section_size) ||
	    !IS_ALIGNED(psize, section_size))
		return false;

	/*
	 * The pre-HVO code does not deal with splitting PMDs,
	 * so the bootmem page must be aligned to the number
	 * of base pages that can be mapped with one vmemmap PMD.
	 */
	pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
	if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
	    !IS_ALIGNED(psize, pmd_vmemmap_size))
		return false;

	return true;
}

/*
 * Initialize memmap section for a gigantic page, HVO-style.
 */
void __init hugetlb_vmemmap_init_early(int nid)
{
	unsigned long psize, paddr, section_size;
	unsigned long ns, i, pnum, pfn, nr_pages;
	unsigned long start, end;
	struct huge_bootmem_page *m = NULL;
	void *map;

	/*
	 * Nothing to do if bootmem pages were not allocated
	 * early in boot, or if HVO wasn't enabled in the
	 * first place.
	 */
	if (!hugetlb_bootmem_allocated())
		return;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	section_size = (1UL << PA_SECTION_SHIFT);

	list_for_each_entry(m, &huge_boot_pages[nid], list) {
		if (!vmemmap_should_optimize_bootmem_page(m))
			continue;

		nr_pages = pages_per_huge_page(m->hstate);
		psize = nr_pages << PAGE_SHIFT;
		paddr = virt_to_phys(m);
		pfn = PHYS_PFN(paddr);
		map = pfn_to_page(pfn);
		start = (unsigned long)map;
		end = start + nr_pages * sizeof(struct page);

		if (vmemmap_populate_hvo(start, end, nid,
					 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
			continue;

		memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);

		pnum = pfn_to_section_nr(pfn);
		ns = psize / section_size;

		for (i = 0; i < ns; i++) {
			sparse_init_early_section(nid, map, pnum,
						  SECTION_IS_VMEMMAP_PREINIT);
			map += section_map_size();
			pnum++;
		}

		m->flags |= HUGE_BOOTMEM_HVO;
	}
}

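/*
 * Late check of pre-HVO-ed bootmem pages: undo HVO and give the memory back
 * to memblock for any page that turns out to span multiple zones.
 */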
void __init hugetlb_vmemmap_init_late(int nid)
{
	struct huge_bootmem_page *m, *tm;
	unsigned long phys, nr_pages, start, end;
	unsigned long pfn, nr_mmap;
	struct hstate *h;
	void *map;

	if (!hugetlb_bootmem_allocated())
		return;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
		if (!(m->flags & HUGE_BOOTMEM_HVO))
			continue;

		phys = virt_to_phys(m);
		h = m->hstate;
		pfn = PHYS_PFN(phys);
		nr_pages = pages_per_huge_page(h);

		if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
			/*
			 * Oops, the hugetlb page spans multiple zones.
			 * Remove it from the list, and undo HVO.
			 */
			list_del(&m->list);

			map = pfn_to_page(pfn);

			start = (unsigned long)map;
			end = start + nr_pages * sizeof(struct page);

			vmemmap_undo_hvo(start, end, nid,
					 HUGETLB_VMEMMAP_RESERVE_SIZE);
			nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
			memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));

			memblock_phys_free(phys, huge_page_size(h));
			continue;
		} else
			m->flags |= HUGE_BOOTMEM_ZONES_VALID;
	}
}
#endif

static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};

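/*
 * Register the hugetlb_optimize_vmemmap sysctl, but only if at least one
 * configured hstate is actually HVO-optimizable.
 */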
static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);