xref: /linux/mm/hugetlb_vmemmap.c (revision da3e2d1ca43de56a83a806237b6be7e91cf07052)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * HugeTLB Vmemmap Optimization (HVO)
4  *
5  * Copyright (c) 2020, ByteDance. All rights reserved.
6  *
7  *     Author: Muchun Song <songmuchun@bytedance.com>
8  *
9  * See Documentation/mm/vmemmap_dedup.rst
10  */
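/*
 * Rough sense of scale, using the figures from Documentation/mm/vmemmap_dedup.rst
 * (assuming x86-64 with 4 KiB base pages and a 64-byte struct page): a 2 MiB
 * HugeTLB page needs 512 struct pages, i.e. 8 vmemmap pages, of which 7 can be
 * freed (~28 KiB saved per huge page); a 1 GiB page needs 4096 vmemmap pages,
 * of which 4095 can be freed (~16 MiB saved).  This file implements the
 * deduplication by remapping the freed positions to a copied head page and a
 * shared, read-only per-zone tail page.
 */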
11 #define pr_fmt(fmt)	"HugeTLB: " fmt
12 
13 #include <linux/pgtable.h>
14 #include <linux/moduleparam.h>
15 #include <linux/bootmem_info.h>
16 #include <linux/mmdebug.h>
17 #include <linux/pagewalk.h>
18 #include <linux/pgalloc.h>
19 
20 #include <asm/tlbflush.h>
21 #include "hugetlb_vmemmap.h"
22 #include "internal.h"
23 
24 /**
25  * struct vmemmap_remap_walk - walk vmemmap page table
26  *
27  * @remap_pte:		called for each lowest-level entry (PTE).
28  * @nr_walked:		the number of PTEs walked so far.
29  * @vmemmap_head:	the page to be installed at the first position of the vmemmap range
30  * @vmemmap_tail:	the page to be installed at all non-first (tail) positions of the vmemmap range
31  * @vmemmap_pages:	the list head of the vmemmap pages that can be freed,
32  *			or of the pages to remap the range from when restoring.
33  * @flags:		used to modify behavior in vmemmap page table walking
34  *			operations.
35  */
36 struct vmemmap_remap_walk {
37 	void			(*remap_pte)(pte_t *pte, unsigned long addr,
38 					     struct vmemmap_remap_walk *walk);
39 
40 	unsigned long		nr_walked;
41 	struct page		*vmemmap_head;
42 	struct page		*vmemmap_tail;
43 	struct list_head	*vmemmap_pages;
44 
45 
46 /* Skip the TLB flush when we split the PMD */
47 #define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
48 /* Skip the TLB flush when we remap the PTE */
49 #define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
50 	unsigned long		flags;
51 };
52 
53 static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
54 			     struct vmemmap_remap_walk *walk)
55 {
56 	pmd_t __pmd;
57 	int i;
58 	unsigned long addr = start;
59 	pte_t *pgtable;
60 
61 	pgtable = pte_alloc_one_kernel(&init_mm);
62 	if (!pgtable)
63 		return -ENOMEM;
64 
65 	pmd_populate_kernel(&init_mm, &__pmd, pgtable);
66 
67 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
68 		pte_t entry, *pte;
69 		pgprot_t pgprot = PAGE_KERNEL;
70 
71 		entry = mk_pte(head + i, pgprot);
72 		pte = pte_offset_kernel(&__pmd, addr);
73 		set_pte_at(&init_mm, addr, pte, entry);
74 	}
75 
76 	spin_lock(&init_mm.page_table_lock);
77 	if (likely(pmd_leaf(*pmd))) {
78 		/*
79 		 * Higher order allocations from buddy allocator must be able to
80 		 * be treated as independent small pages (as they can be freed
81 		 * individually).
82 		 */
83 		if (!PageReserved(head))
84 			split_page(head, get_order(PMD_SIZE));
85 
86 		/* Make pte visible before pmd. See comment in pmd_install(). */
87 		smp_wmb();
88 		pmd_populate_kernel(&init_mm, pmd, pgtable);
89 		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
90 			flush_tlb_kernel_range(start, start + PMD_SIZE);
91 	} else {
92 		pte_free_kernel(&init_mm, pgtable);
93 	}
94 	spin_unlock(&init_mm.page_table_lock);
95 
96 	return 0;
97 }
98 
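/*
 * Called for each PMD covering the vmemmap range being walked.  Leaf PMDs are
 * split into PTE tables via vmemmap_split_pmd() so that individual vmemmap
 * pages can later be remapped and freed.  If the very first vmemmap page of
 * the range is self-hosted (memory_hotplug.memmap_on_memory), the walk is
 * aborted with -ENOTSUPP, since self-hosted vmemmap cannot be freed by HVO.
 */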
99 static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
100 			     unsigned long next, struct mm_walk *walk)
101 {
102 	int ret = 0;
103 	struct page *head;
104 	struct vmemmap_remap_walk *vmemmap_walk = walk->private;
105 
106 	/* Only splitting, not remapping the vmemmap pages. */
107 	if (!vmemmap_walk->remap_pte)
108 		walk->action = ACTION_CONTINUE;
109 
110 	spin_lock(&init_mm.page_table_lock);
111 	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
112 	 * Due to HugeTLB alignment requirements, and the vmemmap pages
113 	 * being placed at the start of the hotplugged memory region in
114 	 * the memory_hotplug.memmap_on_memory case, it is sufficient to
115 	 * check whether the page backing the first vmemmap address is
116 	 * self-hosted.
117 	 * if it is self-hosted is sufficient.
118 	 *
119 	 * [                  hotplugged memory                  ]
120 	 * [        section        ][...][        section        ]
121 	 * [ vmemmap ][              usable memory               ]
122 	 *   ^  | ^                        |
123 	 *   +--+ |                        |
124 	 *        +------------------------+
125 	 */
126 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
127 		struct page *page = head ? head + pte_index(addr) :
128 				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));
129 
130 		if (PageVmemmapSelfHosted(page))
131 			ret = -ENOTSUPP;
132 	}
133 	spin_unlock(&init_mm.page_table_lock);
134 	if (!head || ret)
135 		return ret;
136 
137 	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
138 }
139 
140 static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
141 			     unsigned long next, struct mm_walk *walk)
142 {
143 	struct vmemmap_remap_walk *vmemmap_walk = walk->private;
144 
145 	vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
146 	vmemmap_walk->nr_walked++;
147 
148 	return 0;
149 }
150 
151 static const struct mm_walk_ops vmemmap_remap_ops = {
152 	.pmd_entry	= vmemmap_pmd_entry,
153 	.pte_entry	= vmemmap_pte_entry,
154 };
155 
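/*
 * Walk the kernel page tables covering the vmemmap range [start, end),
 * invoking the pmd_entry/pte_entry callbacks above for each entry.  If PTEs
 * were remapped (walk->remap_pte is set) and VMEMMAP_REMAP_NO_TLB_FLUSH is
 * not requested, the TLB is flushed for the whole range afterwards; otherwise
 * the caller is responsible for batching the flush.
 */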
156 static int vmemmap_remap_range(unsigned long start, unsigned long end,
157 			       struct vmemmap_remap_walk *walk)
158 {
159 	int ret;
160 
161 	VM_BUG_ON(!PAGE_ALIGNED(start | end));
162 
163 	mmap_read_lock(&init_mm);
164 	ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
165 				    NULL, walk);
166 	mmap_read_unlock(&init_mm);
167 	if (ret)
168 		return ret;
169 
170 	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
171 		flush_tlb_kernel_range(start, end);
172 
173 	return 0;
174 }
175 
176 /*
177  * Free a vmemmap page. A vmemmap page can be allocated from either the
178  * memblock allocator or the buddy allocator. If the PG_reserved flag is
179  * set, the page was allocated from the memblock allocator, so free it via
180  * free_bootmem_page(). Otherwise, use __free_page().
181  */
182 static inline void free_vmemmap_page(struct page *page)
183 {
184 	if (PageReserved(page)) {
185 		memmap_boot_pages_add(-1);
186 		free_bootmem_page(page);
187 	} else {
188 		memmap_pages_add(-1);
189 		__free_page(page);
190 	}
191 }
192 
193 /* Free a list of the vmemmap pages */
194 static void free_vmemmap_page_list(struct list_head *list)
195 {
196 	struct page *page, *next;
197 
198 	list_for_each_entry_safe(page, next, list, lru)
199 		free_vmemmap_page(page);
200 }
201 
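/*
 * remap_pte callback for the freeing direction: the first vmemmap page of the
 * range is remapped read/write to the private @vmemmap_head copy, every other
 * page is remapped read-only to the shared @vmemmap_tail, and the page that
 * previously backed the PTE is queued on @vmemmap_pages for freeing.
 */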
202 static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
203 			      struct vmemmap_remap_walk *walk)
204 {
205 	struct page *page = pte_page(ptep_get(pte));
206 	pte_t entry;
207 
208 	/* Remapping the head page requires r/w */
209 	if (unlikely(walk->nr_walked == 0 && walk->vmemmap_head)) {
210 		list_del(&walk->vmemmap_head->lru);
211 
212 		/*
213 		 * Makes sure that preceding stores to the page contents from
214 		 * vmemmap_remap_free() become visible before the set_pte_at()
215 		 * write.
216 		 */
217 		smp_wmb();
218 
219 		entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL);
220 	} else {
221 		/*
222 		 * Remap the tail pages as read-only to catch illegal write
223 		 * operation to the tail pages.
224 		 */
225 		entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO);
226 	}
227 
228 	list_add(&page->lru, walk->vmemmap_pages);
229 	set_pte_at(&init_mm, addr, pte, entry);
230 }
231 
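/*
 * remap_pte callback for the restore direction: take a fresh page from
 * @vmemmap_pages, fill its struct page slots from the currently mapped
 * vmemmap (using the last tail entry of the head vmemmap page as template),
 * and install it read/write in place of the shared tail mapping.
 */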
232 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
233 				struct vmemmap_remap_walk *walk)
234 {
235 	struct page *page;
236 	struct page *from, *to;
237 
238 	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
239 	list_del(&page->lru);
240 
241 	/*
242 	 * Initialize tail pages in the newly allocated vmemmap page.
243 	 *
244 	 * There is folio-scope metadata that is encoded in the first few
245 	 * tail pages.
246 	 *
247 	 * Use the value of the last tail struct page in the vmemmap page that
248 	 * contains the head page to initialize the rest of the tail pages.
249 	 */
250 	from = compound_head((struct page *)addr) +
251 		PAGE_SIZE / sizeof(struct page) - 1;
252 	to = page_to_virt(page);
253 	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++, to++)
254 		*to = *from;
255 
256 	/*
257 	 * Makes sure that preceding stores to the page contents become visible
258 	 * before the set_pte_at() write.
259 	 */
260 	smp_wmb();
261 	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
262 }
263 
264 /**
265  * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
266  *                      backing PMDs of the directmap into PTEs
267  * @start:     start address of the vmemmap virtual address range that we want
268  *             to remap.
269  * @end:       end address of the vmemmap virtual address range that we want to
270  *             remap.
271  * Return: %0 on success, negative error code otherwise.
272  */
273 static int vmemmap_remap_split(unsigned long start, unsigned long end)
274 {
275 	struct vmemmap_remap_walk walk = {
276 		.remap_pte	= NULL,
277 		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
278 	};
279 
280 	return vmemmap_remap_range(start, end, &walk);
281 }
282 
283 /**
284  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
285  *			to use @vmemmap_head/@vmemmap_tail, then free the
286  *			vmemmap pages which the range was mapped to.
287  * @start:	start address of the vmemmap virtual address range that we want
288  *		to remap.
289  * @end:	end address of the vmemmap virtual address range that we want to
290  *		remap.
291  * @vmemmap_head: the page to be installed at the first position of the vmemmap range
292  * @vmemmap_tail: the page to be installed at all non-first (tail) positions of the vmemmap range
293  * @vmemmap_pages: list to deposit the vmemmap pages to be freed.  It is the
294  *		caller's responsibility to free the pages.
295  * @flags:	modifications to vmemmap_remap_walk flags
296  *
297  * Return: %0 on success, negative error code otherwise.
298  */
299 static int vmemmap_remap_free(unsigned long start, unsigned long end,
300 			      struct page *vmemmap_head,
301 			      struct page *vmemmap_tail,
302 			      struct list_head *vmemmap_pages,
303 			      unsigned long flags)
304 {
305 	int ret;
306 	struct vmemmap_remap_walk walk = {
307 		.remap_pte	= vmemmap_remap_pte,
308 		.vmemmap_head	= vmemmap_head,
309 		.vmemmap_tail	= vmemmap_tail,
310 		.vmemmap_pages	= vmemmap_pages,
311 		.flags		= flags,
312 	};
313 
314 	ret = vmemmap_remap_range(start, end, &walk);
315 	if (!ret || !walk.nr_walked)
316 		return ret;
317 
318 	end = start + walk.nr_walked * PAGE_SIZE;
319 
320 	/*
321 	 * vmemmap_pages contains pages from the previous vmemmap_remap_range()
322 	 * call which failed.  These are pages which were removed from
323 	 * the vmemmap. They will be restored in the following call.
324 	 */
325 	walk = (struct vmemmap_remap_walk) {
326 		.remap_pte	= vmemmap_restore_pte,
327 		.vmemmap_pages	= vmemmap_pages,
328 		.flags		= 0,
329 	};
330 
331 	vmemmap_remap_range(start, end, &walk);
332 
333 	return ret;
334 }
335 
336 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
337 				   struct list_head *list)
338 {
339 	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
340 	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
341 	int nid = page_to_nid((struct page *)start);
342 	struct page *page, *next;
343 	int i;
344 
345 	for (i = 0; i < nr_pages; i++) {
346 		page = alloc_pages_node(nid, gfp_mask, 0);
347 		if (!page)
348 			goto out;
349 		list_add(&page->lru, list);
350 	}
351 	memmap_pages_add(nr_pages);
352 
353 	return 0;
354 out:
355 	list_for_each_entry_safe(page, next, list, lru)
356 		__free_page(page);
357 	return -ENOMEM;
358 }
359 
360 /**
361  * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
362  *			 to newly allocated pages, one for each vmemmap page
363  *			 in the range.
364  * @start:	start address of the vmemmap virtual address range that we want
365  *		to remap.
366  * @end:	end address of the vmemmap virtual address range that we want to
367  *		remap.
368  * @flags:	modifications to vmemmap_remap_walk flags
369  *
370  * Return: %0 on success, negative error code otherwise.
371  */
372 static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
373 			       unsigned long flags)
374 {
375 	LIST_HEAD(vmemmap_pages);
376 	struct vmemmap_remap_walk walk = {
377 		.remap_pte	= vmemmap_restore_pte,
378 		.vmemmap_pages	= &vmemmap_pages,
379 		.flags		= flags,
380 	};
381 
382 	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
383 		return -ENOMEM;
384 
385 	return vmemmap_remap_range(start, end, &walk);
386 }
387 
388 static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
389 static int __init hugetlb_vmemmap_optimize_param(char *buf)
390 {
391 	return kstrtobool(buf, &vmemmap_optimize_enabled);
392 }
393 early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);
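/*
 * The compile-time default above can be overridden at boot, e.g. with
 * "hugetlb_free_vmemmap=on" or "=off" on the kernel command line; see
 * Documentation/admin-guide/kernel-parameters.txt.
 */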
394 
395 static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
396 					   struct folio *folio, unsigned long flags)
397 {
398 	int ret;
399 	unsigned long vmemmap_start, vmemmap_end;
400 
401 	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
402 	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
403 
404 	if (!folio_test_hugetlb_vmemmap_optimized(folio))
405 		return 0;
406 
407 	vmemmap_start	= (unsigned long)&folio->page;
408 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
409 
410 	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
411 
412 	/*
413 	 * The pages which the vmemmap virtual address range [@vmemmap_start,
414 	 * @vmemmap_end) are mapped to are freed to the buddy allocator.
415 	 * When a HugeTLB page is freed to the buddy allocator, previously
416 	 * discarded vmemmap pages must be allocated and remapped.
417 	 */
418 	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags);
419 	if (!ret)
420 		folio_clear_hugetlb_vmemmap_optimized(folio);
421 
422 	return ret;
423 }
424 
425 /**
426  * hugetlb_vmemmap_restore_folio - restore previously optimized (by
427  *				hugetlb_vmemmap_optimize_folio()) vmemmap pages which
428  *				will be reallocated and remapped.
429  * @h:		struct hstate.
430  * @folio:     the folio whose vmemmap pages will be restored.
431  *
432  * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
433  * negative error code otherwise.
434  */
435 int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
436 {
437 	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
438 }
439 
440 /**
441  * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
442  * @h:			hstate.
443  * @folio_list:		list of folios.
444  * @non_hvo_folios:	Output list of folios for which vmemmap exists.
445  *
446  * Return: number of folios for which vmemmap was restored, or an error code
447  *		if an error was encountered restoring vmemmap for a folio.
448  *		Folios that have vmemmap are moved to the non_hvo_folios
449  *		list.  Processing of entries stops when the first error is
450  *		encountered. The folio that experienced the error and all
451  *		non-processed folios will remain on folio_list.
452  */
453 long hugetlb_vmemmap_restore_folios(const struct hstate *h,
454 					struct list_head *folio_list,
455 					struct list_head *non_hvo_folios)
456 {
457 	struct folio *folio, *t_folio;
458 	long restored = 0;
459 	long ret = 0;
460 	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH;
461 
462 	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
463 		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
464 			ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
465 			if (ret)
466 				break;
467 			restored++;
468 		}
469 
470 		/* Add non-optimized folios to output list */
471 		list_move(&folio->lru, non_hvo_folios);
472 	}
473 
474 	if (restored)
475 		flush_tlb_all();
476 	if (!ret)
477 		ret = restored;
478 	return ret;
479 }
480 
481 /* Return true iff a HugeTLB folio's vmemmap should and can be optimized. */
482 static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
483 {
484 	if (folio_test_hugetlb_vmemmap_optimized(folio))
485 		return false;
486 
487 	if (!READ_ONCE(vmemmap_optimize_enabled))
488 		return false;
489 
490 	if (!hugetlb_vmemmap_optimizable(h))
491 		return false;
492 
493 	return true;
494 }
495 
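/*
 * Get the per-zone page of pre-initialized tail struct pages for @order, used
 * as the (read-only mapped) target for all tail positions of optimized folios.
 * The page is allocated lazily, filled with compound-tail entries for this
 * order and zone, and published in zone->vmemmap_tails[] with cmpxchg(); a
 * racing allocator that loses the race frees its copy and uses the winner's.
 */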
496 static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone)
497 {
498 	const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER;
499 	struct page *tail, *p;
500 	int node = zone_to_nid(zone);
501 
502 	tail = READ_ONCE(zone->vmemmap_tails[idx]);
503 	if (likely(tail))
504 		return tail;
505 
506 	tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
507 	if (!tail)
508 		return NULL;
509 
510 	p = page_to_virt(tail);
511 	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
512 		init_compound_tail(p + i, NULL, order, zone);
513 
514 	if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) {
515 		__free_page(tail);
516 		tail = READ_ONCE(zone->vmemmap_tails[idx]);
517 	}
518 
519 	return tail;
520 }
521 
522 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
523 					    struct folio *folio,
524 					    struct list_head *vmemmap_pages,
525 					    unsigned long flags)
526 {
527 	unsigned long vmemmap_start, vmemmap_end;
528 	struct page *vmemmap_head, *vmemmap_tail;
529 	int nid, ret = 0;
530 
531 	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
532 	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
533 
534 	if (!vmemmap_should_optimize_folio(h, folio))
535 		return ret;
536 
537 	nid = folio_nid(folio);
538 	vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio));
539 	if (!vmemmap_tail)
540 		return -ENOMEM;
541 
542 	/*
543 	 * Very Subtle
544 	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
545 	 * immediately after remapping.  As a result, subsequent accesses
546 	 * and modifications to struct pages associated with the hugetlb
547 	 * page could be to the OLD struct pages.  Set the vmemmap optimized
548 	 * flag here so that it is copied to the new head page.  This keeps
549 	 * the old and new struct pages in sync.
550 	 * If there is an error during optimization, we will immediately FLUSH
551 	 * the TLB and clear the flag below.
552 	 */
553 	folio_set_hugetlb_vmemmap_optimized(folio);
554 
555 	vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
556 	if (!vmemmap_head) {
557 		ret = -ENOMEM;
558 		goto out;
559 	}
560 
561 	copy_page(page_to_virt(vmemmap_head), folio);
562 	list_add(&vmemmap_head->lru, vmemmap_pages);
563 	memmap_pages_add(1);
564 
565 	vmemmap_start	= (unsigned long)&folio->page;
566 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
567 
568 	/*
569 	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end).
570 	 * Add pages previously mapping the range to vmemmap_pages list so that
571 	 * they can be freed by the caller.
572 	 */
573 	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end,
574 				 vmemmap_head, vmemmap_tail,
575 				 vmemmap_pages, flags);
576 out:
577 	if (ret)
578 		folio_clear_hugetlb_vmemmap_optimized(folio);
579 
580 	return ret;
581 }
582 
583 /**
584  * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
585  * @h:		struct hstate.
586  * @folio:     the folio whose vmemmap pages will be optimized.
587  *
588  * This function only tries to optimize @folio's vmemmap pages and does not
589  * guarantee that the optimization will succeed after it returns. The caller
590  * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
591  * vmemmap pages have been optimized.
592  */
593 void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
594 {
595 	LIST_HEAD(vmemmap_pages);
596 
597 	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
598 	free_vmemmap_page_list(&vmemmap_pages);
599 }
600 
601 static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
602 {
603 	unsigned long vmemmap_start, vmemmap_end;
604 
605 	if (!vmemmap_should_optimize_folio(h, folio))
606 		return 0;
607 
608 	vmemmap_start	= (unsigned long)&folio->page;
609 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
610 
611 	/*
612 	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
613 	 * @vmemmap_end).
614 	 */
615 	return vmemmap_remap_split(vmemmap_start, vmemmap_end);
616 }
617 
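/*
 * Optimize every folio on @folio_list in two passes so that TLB flushes are
 * batched: first split all covering vmemmap PMDs without per-folio flushes,
 * flush the TLB once, then remap each folio's vmemmap with
 * VMEMMAP_REMAP_NO_TLB_FLUSH and flush once more at the end.  With @boot set,
 * folios already optimized by pre-HVO only get their mirrored tail struct
 * pages write-protected and registered as bootmem memmap.
 */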
618 static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
619 					      struct list_head *folio_list,
620 					      bool boot)
621 {
622 	struct folio *folio;
623 	int nr_to_optimize;
624 	LIST_HEAD(vmemmap_pages);
625 	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH;
626 
627 	nr_to_optimize = 0;
628 	list_for_each_entry(folio, folio_list, lru) {
629 		int ret;
630 		unsigned long spfn, epfn;
631 
632 		if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
633 			/*
634 			 * Already optimized by pre-HVO, just map the
635 			 * mirrored tail page structs RO.
636 			 */
637 			spfn = (unsigned long)&folio->page;
638 			epfn = spfn + pages_per_huge_page(h);
639 			vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
640 					HUGETLB_VMEMMAP_RESERVE_SIZE);
641 			register_page_bootmem_memmap(pfn_to_section_nr(spfn),
642 					&folio->page,
643 					HUGETLB_VMEMMAP_RESERVE_SIZE);
644 			continue;
645 		}
646 
647 		nr_to_optimize++;
648 
649 		ret = hugetlb_vmemmap_split_folio(h, folio);
650 
651 		/*
652 		 * Splitting the PMD requires allocating a page, thus let's fail
653 		 * early once we encounter the first OOM. No point in retrying
654 		 * as it can be dynamically done on remap with the memory
655 		 * we get back from the vmemmap deduplication.
656 		 */
657 		if (ret == -ENOMEM)
658 			break;
659 	}
660 
661 	if (!nr_to_optimize)
662 		/*
663 		 * All pre-HVO folios, nothing left to do. It's ok if
664 		 * there is a mix of pre-HVO and not yet HVO-ed folios
665 		 * here, as __hugetlb_vmemmap_optimize_folio() will
666 		 * skip any folios that already have the optimized flag
667 		 * set, see vmemmap_should_optimize_folio().
668 		 */
669 		goto out;
670 
671 	flush_tlb_all();
672 
673 	list_for_each_entry(folio, folio_list, lru) {
674 		int ret;
675 
676 		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
677 
678 		/*
679 		 * Pages to be freed may have been accumulated.  If we
680 		 * encounter an ENOMEM, free what we have and try again.
681 		 * This can occur when splitting failed partway through
682 		 * and head page allocation also failed; freeing the
683 		 * accumulated pages releases memory, allowing the retried
684 		 * __hugetlb_vmemmap_optimize_folio() to make progress.
685 		 */
686 		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
687 			flush_tlb_all();
688 			free_vmemmap_page_list(&vmemmap_pages);
689 			INIT_LIST_HEAD(&vmemmap_pages);
690 			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
691 		}
692 	}
693 
694 out:
695 	flush_tlb_all();
696 	free_vmemmap_page_list(&vmemmap_pages);
697 }
698 
699 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
700 {
701 	__hugetlb_vmemmap_optimize_folios(h, folio_list, false);
702 }
703 
704 void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
705 {
706 	__hugetlb_vmemmap_optimize_folios(h, folio_list, true);
707 }
708 
709 #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
710 
711 /* Return true if a bootmem-allocated HugeTLB page should be pre-HVO-ed */
712 static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
713 {
714 	unsigned long section_size, psize, pmd_vmemmap_size;
715 	phys_addr_t paddr;
716 
717 	if (!READ_ONCE(vmemmap_optimize_enabled))
718 		return false;
719 
720 	if (!hugetlb_vmemmap_optimizable(m->hstate))
721 		return false;
722 
723 	psize = huge_page_size(m->hstate);
724 	paddr = virt_to_phys(m);
725 
726 	/*
727 	 * Pre-HVO only works if the bootmem huge page
728 	 * is aligned to the section size.
729 	 */
730 	section_size = (1UL << PA_SECTION_SHIFT);
731 	if (!IS_ALIGNED(paddr, section_size) ||
732 	    !IS_ALIGNED(psize, section_size))
733 		return false;
734 
735 	/*
736 	 * The pre-HVO code does not deal with splitting PMDS,
737 	 * so the bootmem page must be aligned to the number
738 	 * of base pages that can be mapped with one vmemmap PMD.
739 	 */
740 	pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
741 	if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
742 	    !IS_ALIGNED(psize, pmd_vmemmap_size))
743 		return false;
744 
745 	return true;
746 }
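/*
 * Rough worked example (assuming x86-64 with 4 KiB base pages, a 64-byte
 * struct page and 128 MiB sections): one vmemmap PMD covers
 * (2 MiB / 64) * 4 KiB = 128 MiB of memory, so pre-HVO requires the bootmem
 * page to be 128 MiB aligned in both address and size; a 1 GiB gigantic page
 * qualifies, a 2 MiB page does not.
 */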
747 
748 /*
749  * Initialize memmap section for a gigantic page, HVO-style.
750  */
751 void __init hugetlb_vmemmap_init_early(int nid)
752 {
753 	unsigned long psize, paddr, section_size;
754 	unsigned long ns, i, pnum, pfn, nr_pages;
755 	struct huge_bootmem_page *m = NULL;
756 	void *map;
757 
758 	if (!READ_ONCE(vmemmap_optimize_enabled))
759 		return;
760 
761 	section_size = (1UL << PA_SECTION_SHIFT);
762 
763 	list_for_each_entry(m, &huge_boot_pages[nid], list) {
764 		if (!vmemmap_should_optimize_bootmem_page(m))
765 			continue;
766 
767 		nr_pages = pages_per_huge_page(m->hstate);
768 		psize = nr_pages << PAGE_SHIFT;
769 		paddr = virt_to_phys(m);
770 		pfn = PHYS_PFN(paddr);
771 		map = pfn_to_page(pfn);
772 
773 		pnum = pfn_to_section_nr(pfn);
774 		ns = psize / section_size;
775 
776 		for (i = 0; i < ns; i++) {
777 			sparse_init_early_section(nid, map, pnum,
778 					SECTION_IS_VMEMMAP_PREINIT);
779 			map += section_map_size();
780 			pnum++;
781 		}
782 
783 		m->flags |= HUGE_BOOTMEM_HVO;
784 	}
785 }
786 
787 static struct zone *pfn_to_zone(unsigned nid, unsigned long pfn)
788 {
789 	struct zone *zone;
790 	enum zone_type zone_type;
791 
792 	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
793 		zone = &NODE_DATA(nid)->node_zones[zone_type];
794 		if (zone_spans_pfn(zone, pfn))
795 			return zone;
796 	}
797 
798 	return NULL;
799 }
800 
801 void __init hugetlb_vmemmap_init_late(int nid)
802 {
803 	struct huge_bootmem_page *m, *tm;
804 	unsigned long phys, nr_pages, start, end;
805 	unsigned long pfn, nr_mmap;
806 	struct zone *zone = NULL;
807 	struct hstate *h;
808 	void *map;
809 
810 	if (!READ_ONCE(vmemmap_optimize_enabled))
811 		return;
812 
813 	list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
814 		if (!(m->flags & HUGE_BOOTMEM_HVO))
815 			continue;
816 
817 		phys = virt_to_phys(m);
818 		h = m->hstate;
819 		pfn = PHYS_PFN(phys);
820 		nr_pages = pages_per_huge_page(h);
821 		map = pfn_to_page(pfn);
822 		start = (unsigned long)map;
823 		end = start + nr_pages * sizeof(struct page);
824 
825 		if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
826 			/*
827 			 * Oops, the hugetlb page spans multiple zones.
828 			 * Remove it from the list, and populate it normally.
829 			 */
830 			list_del(&m->list);
831 
832 			vmemmap_populate(start, end, nid, NULL);
833 			nr_mmap = end - start;
834 			memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));
835 
836 			memblock_phys_free(phys, huge_page_size(h));
837 			continue;
838 		}
839 
840 		if (!zone || !zone_spans_pfn(zone, pfn))
841 			zone = pfn_to_zone(nid, pfn);
842 		if (WARN_ON_ONCE(!zone))
843 			continue;
844 
845 		if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone,
846 					 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) {
847 			/* Fallback if HVO population fails */
848 			vmemmap_populate(start, end, nid, NULL);
849 			nr_mmap = end - start;
850 		} else {
851 			m->flags |= HUGE_BOOTMEM_ZONES_VALID;
852 			nr_mmap = HUGETLB_VMEMMAP_RESERVE_SIZE;
853 		}
854 
855 		memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));
856 	}
857 }
858 #endif
859 
860 static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
861 	{
862 		.procname	= "hugetlb_optimize_vmemmap",
863 		.data		= &vmemmap_optimize_enabled,
864 		.maxlen		= sizeof(vmemmap_optimize_enabled),
865 		.mode		= 0644,
866 		.proc_handler	= proc_dobool,
867 	},
868 };
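/*
 * Runtime toggle, e.g. "sysctl vm.hugetlb_optimize_vmemmap=0"; it is only
 * registered below if at least one hstate is optimizable.  See
 * Documentation/admin-guide/sysctl/vm.rst for the documented semantics.
 */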
869 
870 static int __init hugetlb_vmemmap_init(void)
871 {
872 	const struct hstate *h;
873 	struct zone *zone;
874 
875 	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
876 	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
877 
878 	for_each_zone(zone) {
879 		for (int i = 0; i < NR_VMEMMAP_TAILS; i++) {
880 			struct page *tail, *p;
881 			unsigned int order;
882 
883 			tail = zone->vmemmap_tails[i];
884 			if (!tail)
885 				continue;
886 
887 			order = i + VMEMMAP_TAIL_MIN_ORDER;
888 			p = page_to_virt(tail);
889 			for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++)
890 				init_compound_tail(p + j, NULL, order, zone);
891 		}
892 	}
893 
894 	for_each_hstate(h) {
895 		if (hugetlb_vmemmap_optimizable(h)) {
896 			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
897 			break;
898 		}
899 	}
900 	return 0;
901 }
902 late_initcall(hugetlb_vmemmap_init);
903