xref: /linux/mm/sparse-vmemmap.c (revision 55896f935a60b919ce699d11754061f6df936a7d)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
28f6aac41SChristoph Lameter /*
38f6aac41SChristoph Lameter  * Virtual Memory Map support
48f6aac41SChristoph Lameter  *
5cde53535SChristoph Lameter  * (C) 2007 sgi. Christoph Lameter.
68f6aac41SChristoph Lameter  *
78f6aac41SChristoph Lameter  * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
88f6aac41SChristoph Lameter  * virt_to_page, page_address() to be implemented as a base offset
98f6aac41SChristoph Lameter  * calculation without memory access.
108f6aac41SChristoph Lameter  *
118f6aac41SChristoph Lameter  * However, virtual mappings need a page table and TLBs. Many Linux
128f6aac41SChristoph Lameter  * architectures already map their physical space using 1-1 mappings
13b595076aSUwe Kleine-König  * via TLBs. For those arches the virtual memory map is essentially
148f6aac41SChristoph Lameter  * for free if we use the same page size as the 1-1 mappings. In that
158f6aac41SChristoph Lameter  * case the overhead consists of a few additional pages that are
168f6aac41SChristoph Lameter  * allocated to create a view of memory for vmemmap.
178f6aac41SChristoph Lameter  *
1829c71111SAndy Whitcroft  * The architecture is expected to provide a vmemmap_populate() function
1929c71111SAndy Whitcroft  * to instantiate the mapping.
208f6aac41SChristoph Lameter  */
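/*
 * For example, with CONFIG_SPARSEMEM_VMEMMAP the generic pfn/page helpers
 * reduce to plain pointer arithmetic against the vmemmap base (a sketch of
 * the asm-generic definitions, not an exhaustive list):
 *
 *	#define __pfn_to_page(pfn)	(vmemmap + (pfn))
 *	#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
 */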
218f6aac41SChristoph Lameter #include <linux/mm.h>
228f6aac41SChristoph Lameter #include <linux/mmzone.h>
2397ad1087SMike Rapoport #include <linux/memblock.h>
244b94ffdcSDan Williams #include <linux/memremap.h>
258f6aac41SChristoph Lameter #include <linux/highmem.h>
265a0e3ad6STejun Heo #include <linux/slab.h>
278f6aac41SChristoph Lameter #include <linux/spinlock.h>
288f6aac41SChristoph Lameter #include <linux/vmalloc.h>
298bca44bbSGlauber de Oliveira Costa #include <linux/sched.h>
30f41f2ed4SMuchun Song #include <linux/pgtable.h>
31f41f2ed4SMuchun Song #include <linux/bootmem_info.h>
32f41f2ed4SMuchun Song 
338f6aac41SChristoph Lameter #include <asm/dma.h>
348f6aac41SChristoph Lameter #include <asm/pgalloc.h>
35f41f2ed4SMuchun Song #include <asm/tlbflush.h>
36f41f2ed4SMuchun Song 
3747010c04SMuchun Song #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
38f41f2ed4SMuchun Song /**
39f41f2ed4SMuchun Song  * struct vmemmap_remap_walk - walk vmemmap page table
40f41f2ed4SMuchun Song  *
41f41f2ed4SMuchun Song  * @remap_pte:		called for each lowest-level entry (PTE).
423bc2b6a7SMuchun Song  * @nr_walked:		the number of PTEs walked.
43f41f2ed4SMuchun Song  * @reuse_page:		the page which is reused for the tail vmemmap pages.
44f41f2ed4SMuchun Song  * @reuse_addr:		the virtual address of the @reuse_page page.
45ad2fa371SMuchun Song  * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
46ad2fa371SMuchun Song  *			or are mapped from.
47f41f2ed4SMuchun Song  */
48f41f2ed4SMuchun Song struct vmemmap_remap_walk {
49f41f2ed4SMuchun Song 	void (*remap_pte)(pte_t *pte, unsigned long addr,
50f41f2ed4SMuchun Song 			  struct vmemmap_remap_walk *walk);
513bc2b6a7SMuchun Song 	unsigned long nr_walked;
52f41f2ed4SMuchun Song 	struct page *reuse_page;
53f41f2ed4SMuchun Song 	unsigned long reuse_addr;
54f41f2ed4SMuchun Song 	struct list_head *vmemmap_pages;
55f41f2ed4SMuchun Song };
56f41f2ed4SMuchun Song 
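/*
 * Split the huge PMD mapping a PMD_SIZE chunk of the vmemmap into a PTE-level
 * mapping of the same physical pages: the new PTE table is filled while it is
 * still private (hanging off the local __pmd), and is only installed under
 * init_mm.page_table_lock so a concurrent walker sees either the old leaf
 * entry or the complete PTE table.
 */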
57d8d55f56SMuchun Song static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
583bc2b6a7SMuchun Song {
593bc2b6a7SMuchun Song 	pmd_t __pmd;
603bc2b6a7SMuchun Song 	int i;
613bc2b6a7SMuchun Song 	unsigned long addr = start;
623bc2b6a7SMuchun Song 	struct page *page = pmd_page(*pmd);
633bc2b6a7SMuchun Song 	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
643bc2b6a7SMuchun Song 
653bc2b6a7SMuchun Song 	if (!pgtable)
663bc2b6a7SMuchun Song 		return -ENOMEM;
673bc2b6a7SMuchun Song 
683bc2b6a7SMuchun Song 	pmd_populate_kernel(&init_mm, &__pmd, pgtable);
693bc2b6a7SMuchun Song 
703bc2b6a7SMuchun Song 	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
713bc2b6a7SMuchun Song 		pte_t entry, *pte;
723bc2b6a7SMuchun Song 		pgprot_t pgprot = PAGE_KERNEL;
733bc2b6a7SMuchun Song 
743bc2b6a7SMuchun Song 		entry = mk_pte(page + i, pgprot);
753bc2b6a7SMuchun Song 		pte = pte_offset_kernel(&__pmd, addr);
763bc2b6a7SMuchun Song 		set_pte_at(&init_mm, addr, pte, entry);
773bc2b6a7SMuchun Song 	}
783bc2b6a7SMuchun Song 
79d8d55f56SMuchun Song 	spin_lock(&init_mm.page_table_lock);
80d8d55f56SMuchun Song 	if (likely(pmd_leaf(*pmd))) {
81ed33b5a6SQi Zheng 		/* Make pte visible before pmd. See comment in pmd_install(). */
823bc2b6a7SMuchun Song 		smp_wmb();
833bc2b6a7SMuchun Song 		pmd_populate_kernel(&init_mm, pmd, pgtable);
843bc2b6a7SMuchun Song 		flush_tlb_kernel_range(start, start + PMD_SIZE);
85d8d55f56SMuchun Song 	} else {
86d8d55f56SMuchun Song 		pte_free_kernel(&init_mm, pgtable);
87d8d55f56SMuchun Song 	}
88d8d55f56SMuchun Song 	spin_unlock(&init_mm.page_table_lock);
893bc2b6a7SMuchun Song 
903bc2b6a7SMuchun Song 	return 0;
913bc2b6a7SMuchun Song }
923bc2b6a7SMuchun Song 
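/*
 * Check whether the PMD is still a leaf before doing the PTE page allocation
 * in __split_vmemmap_huge_pmd(); the result can go stale once the lock is
 * dropped, so __split re-checks under init_mm.page_table_lock and frees the
 * speculatively allocated PTE page if it lost the race.
 */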
93d8d55f56SMuchun Song static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
94d8d55f56SMuchun Song {
95d8d55f56SMuchun Song 	int leaf;
96d8d55f56SMuchun Song 
97d8d55f56SMuchun Song 	spin_lock(&init_mm.page_table_lock);
98d8d55f56SMuchun Song 	leaf = pmd_leaf(*pmd);
99d8d55f56SMuchun Song 	spin_unlock(&init_mm.page_table_lock);
100d8d55f56SMuchun Song 
101d8d55f56SMuchun Song 	if (!leaf)
102d8d55f56SMuchun Song 		return 0;
103d8d55f56SMuchun Song 
104d8d55f56SMuchun Song 	return __split_vmemmap_huge_pmd(pmd, start);
105d8d55f56SMuchun Song }
106d8d55f56SMuchun Song 
107f41f2ed4SMuchun Song static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
108f41f2ed4SMuchun Song 			      unsigned long end,
109f41f2ed4SMuchun Song 			      struct vmemmap_remap_walk *walk)
110f41f2ed4SMuchun Song {
111f41f2ed4SMuchun Song 	pte_t *pte = pte_offset_kernel(pmd, addr);
112f41f2ed4SMuchun Song 
113f41f2ed4SMuchun Song 	/*
114f41f2ed4SMuchun Song 	 * The reuse_page is found 'first' in the table walk, before we start
115f41f2ed4SMuchun Song 	 * remapping (i.e. before calling @walk->remap_pte).
116f41f2ed4SMuchun Song 	 */
117f41f2ed4SMuchun Song 	if (!walk->reuse_page) {
118f41f2ed4SMuchun Song 		walk->reuse_page = pte_page(*pte);
119f41f2ed4SMuchun Song 		/*
120f41f2ed4SMuchun Song 		 * Because the reuse address is part of the range that we are
121f41f2ed4SMuchun Song 		 * walking, skip the reuse address range.
122f41f2ed4SMuchun Song 		 */
123f41f2ed4SMuchun Song 		addr += PAGE_SIZE;
124f41f2ed4SMuchun Song 		pte++;
1253bc2b6a7SMuchun Song 		walk->nr_walked++;
126f41f2ed4SMuchun Song 	}
127f41f2ed4SMuchun Song 
1283bc2b6a7SMuchun Song 	for (; addr != end; addr += PAGE_SIZE, pte++) {
129f41f2ed4SMuchun Song 		walk->remap_pte(pte, addr, walk);
1303bc2b6a7SMuchun Song 		walk->nr_walked++;
1313bc2b6a7SMuchun Song 	}
132f41f2ed4SMuchun Song }
133f41f2ed4SMuchun Song 
1343bc2b6a7SMuchun Song static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
135f41f2ed4SMuchun Song 			     unsigned long end,
136f41f2ed4SMuchun Song 			     struct vmemmap_remap_walk *walk)
137f41f2ed4SMuchun Song {
138f41f2ed4SMuchun Song 	pmd_t *pmd;
139f41f2ed4SMuchun Song 	unsigned long next;
140f41f2ed4SMuchun Song 
141f41f2ed4SMuchun Song 	pmd = pmd_offset(pud, addr);
142f41f2ed4SMuchun Song 	do {
1433bc2b6a7SMuchun Song 		int ret;
144f41f2ed4SMuchun Song 
145d8d55f56SMuchun Song 		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
1463bc2b6a7SMuchun Song 		if (ret)
1473bc2b6a7SMuchun Song 			return ret;
148d8d55f56SMuchun Song 
149f41f2ed4SMuchun Song 		next = pmd_addr_end(addr, end);
150f41f2ed4SMuchun Song 		vmemmap_pte_range(pmd, addr, next, walk);
151f41f2ed4SMuchun Song 	} while (pmd++, addr = next, addr != end);
1523bc2b6a7SMuchun Song 
1533bc2b6a7SMuchun Song 	return 0;
154f41f2ed4SMuchun Song }
155f41f2ed4SMuchun Song 
1563bc2b6a7SMuchun Song static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
157f41f2ed4SMuchun Song 			     unsigned long end,
158f41f2ed4SMuchun Song 			     struct vmemmap_remap_walk *walk)
159f41f2ed4SMuchun Song {
160f41f2ed4SMuchun Song 	pud_t *pud;
161f41f2ed4SMuchun Song 	unsigned long next;
162f41f2ed4SMuchun Song 
163f41f2ed4SMuchun Song 	pud = pud_offset(p4d, addr);
164f41f2ed4SMuchun Song 	do {
1653bc2b6a7SMuchun Song 		int ret;
1663bc2b6a7SMuchun Song 
167f41f2ed4SMuchun Song 		next = pud_addr_end(addr, end);
1683bc2b6a7SMuchun Song 		ret = vmemmap_pmd_range(pud, addr, next, walk);
1693bc2b6a7SMuchun Song 		if (ret)
1703bc2b6a7SMuchun Song 			return ret;
171f41f2ed4SMuchun Song 	} while (pud++, addr = next, addr != end);
1723bc2b6a7SMuchun Song 
1733bc2b6a7SMuchun Song 	return 0;
174f41f2ed4SMuchun Song }
175f41f2ed4SMuchun Song 
1763bc2b6a7SMuchun Song static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
177f41f2ed4SMuchun Song 			     unsigned long end,
178f41f2ed4SMuchun Song 			     struct vmemmap_remap_walk *walk)
179f41f2ed4SMuchun Song {
180f41f2ed4SMuchun Song 	p4d_t *p4d;
181f41f2ed4SMuchun Song 	unsigned long next;
182f41f2ed4SMuchun Song 
183f41f2ed4SMuchun Song 	p4d = p4d_offset(pgd, addr);
184f41f2ed4SMuchun Song 	do {
1853bc2b6a7SMuchun Song 		int ret;
1863bc2b6a7SMuchun Song 
187f41f2ed4SMuchun Song 		next = p4d_addr_end(addr, end);
1883bc2b6a7SMuchun Song 		ret = vmemmap_pud_range(p4d, addr, next, walk);
1893bc2b6a7SMuchun Song 		if (ret)
1903bc2b6a7SMuchun Song 			return ret;
191f41f2ed4SMuchun Song 	} while (p4d++, addr = next, addr != end);
1923bc2b6a7SMuchun Song 
1933bc2b6a7SMuchun Song 	return 0;
194f41f2ed4SMuchun Song }
195f41f2ed4SMuchun Song 
1963bc2b6a7SMuchun Song static int vmemmap_remap_range(unsigned long start, unsigned long end,
197f41f2ed4SMuchun Song 			       struct vmemmap_remap_walk *walk)
198f41f2ed4SMuchun Song {
199f41f2ed4SMuchun Song 	unsigned long addr = start;
200f41f2ed4SMuchun Song 	unsigned long next;
201f41f2ed4SMuchun Song 	pgd_t *pgd;
202f41f2ed4SMuchun Song 
2030b82ade6SFanjun Kong 	VM_BUG_ON(!PAGE_ALIGNED(start));
2040b82ade6SFanjun Kong 	VM_BUG_ON(!PAGE_ALIGNED(end));
205f41f2ed4SMuchun Song 
206f41f2ed4SMuchun Song 	pgd = pgd_offset_k(addr);
207f41f2ed4SMuchun Song 	do {
2083bc2b6a7SMuchun Song 		int ret;
2093bc2b6a7SMuchun Song 
210f41f2ed4SMuchun Song 		next = pgd_addr_end(addr, end);
2113bc2b6a7SMuchun Song 		ret = vmemmap_p4d_range(pgd, addr, next, walk);
2123bc2b6a7SMuchun Song 		if (ret)
2133bc2b6a7SMuchun Song 			return ret;
214f41f2ed4SMuchun Song 	} while (pgd++, addr = next, addr != end);
215f41f2ed4SMuchun Song 
216f41f2ed4SMuchun Song 	/*
217f41f2ed4SMuchun Song 	 * We only change the mapping of the vmemmap virtual address range
218f41f2ed4SMuchun Song 	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
219f41f2ed4SMuchun Song 	 * belongs to the range.
220f41f2ed4SMuchun Song 	 */
221f41f2ed4SMuchun Song 	flush_tlb_kernel_range(start + PAGE_SIZE, end);
2223bc2b6a7SMuchun Song 
2233bc2b6a7SMuchun Song 	return 0;
224f41f2ed4SMuchun Song }
225f41f2ed4SMuchun Song 
226f41f2ed4SMuchun Song /*
227f41f2ed4SMuchun Song  * Free a vmemmap page. A vmemmap page can be allocated from the memblock
228f41f2ed4SMuchun Song  * allocator or the buddy allocator. If the PG_reserved flag is set, it means
229f41f2ed4SMuchun Song  * that it was allocated from the memblock allocator, so free it via
230f41f2ed4SMuchun Song  * free_bootmem_page(). Otherwise, use __free_page().
231f41f2ed4SMuchun Song  */
232f41f2ed4SMuchun Song static inline void free_vmemmap_page(struct page *page)
233f41f2ed4SMuchun Song {
234f41f2ed4SMuchun Song 	if (PageReserved(page))
235f41f2ed4SMuchun Song 		free_bootmem_page(page);
236f41f2ed4SMuchun Song 	else
237f41f2ed4SMuchun Song 		__free_page(page);
238f41f2ed4SMuchun Song }
239f41f2ed4SMuchun Song 
240f41f2ed4SMuchun Song /* Free a list of the vmemmap pages */
241f41f2ed4SMuchun Song static void free_vmemmap_page_list(struct list_head *list)
242f41f2ed4SMuchun Song {
243f41f2ed4SMuchun Song 	struct page *page, *next;
244f41f2ed4SMuchun Song 
245f41f2ed4SMuchun Song 	list_for_each_entry_safe(page, next, list, lru) {
246f41f2ed4SMuchun Song 		list_del(&page->lru);
247f41f2ed4SMuchun Song 		free_vmemmap_page(page);
248f41f2ed4SMuchun Song 	}
249f41f2ed4SMuchun Song }
250f41f2ed4SMuchun Song 
251f41f2ed4SMuchun Song static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
252f41f2ed4SMuchun Song 			      struct vmemmap_remap_walk *walk)
253f41f2ed4SMuchun Song {
254f41f2ed4SMuchun Song 	/*
255f41f2ed4SMuchun Song 	 * Remap the tail pages as read-only to catch illegal write operation
256f41f2ed4SMuchun Song 	 * to the tail pages.
257f41f2ed4SMuchun Song 	 */
258f41f2ed4SMuchun Song 	pgprot_t pgprot = PAGE_KERNEL_RO;
259f41f2ed4SMuchun Song 	pte_t entry = mk_pte(walk->reuse_page, pgprot);
260f41f2ed4SMuchun Song 	struct page *page = pte_page(*pte);
261f41f2ed4SMuchun Song 
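	/*
	 * The page that used to back this pte is not needed once the pte
	 * points at the shared @reuse_page; queue it on @vmemmap_pages so
	 * the caller can free it after the walk.
	 */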
2623bc2b6a7SMuchun Song 	list_add_tail(&page->lru, walk->vmemmap_pages);
263f41f2ed4SMuchun Song 	set_pte_at(&init_mm, addr, pte, entry);
264f41f2ed4SMuchun Song }
265f41f2ed4SMuchun Song 
266e7d32485SMuchun Song /*
267e7d32485SMuchun Song  * How many struct page structs need to be reset. When we reuse the head
268e7d32485SMuchun Song  * struct page, the special metadata (e.g. page->flags or page->mapping)
269e7d32485SMuchun Song  * cannot be copied to the tail struct page structs. The invalid value
270e7d32485SMuchun Song  * will be checked in free_tail_pages_check(). In order to avoid the
271e7d32485SMuchun Song  * message "corrupted mapping in tail page", we need to reset at least
272e7d32485SMuchun Song  * 3 struct page structs (one head struct page and two tail struct
273e7d32485SMuchun Song  * pages).
274e7d32485SMuchun Song  */
275e7d32485SMuchun Song #define NR_RESET_STRUCT_PAGE		3
276e7d32485SMuchun Song 
277e7d32485SMuchun Song static inline void reset_struct_pages(struct page *start)
278e7d32485SMuchun Song {
279e7d32485SMuchun Song 	int i;
280e7d32485SMuchun Song 	struct page *from = start + NR_RESET_STRUCT_PAGE;
281e7d32485SMuchun Song 
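	/*
	 * @from points at the first struct page beyond the reset window,
	 * i.e. an ordinary tail struct page, and is used as a clean
	 * template for the head and the first two tail struct pages.
	 */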
282e7d32485SMuchun Song 	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
283e7d32485SMuchun Song 		memcpy(start + i, from, sizeof(*from));
284e7d32485SMuchun Song }
285e7d32485SMuchun Song 
2863bc2b6a7SMuchun Song static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
2873bc2b6a7SMuchun Song 				struct vmemmap_remap_walk *walk)
2883bc2b6a7SMuchun Song {
2893bc2b6a7SMuchun Song 	pgprot_t pgprot = PAGE_KERNEL;
2903bc2b6a7SMuchun Song 	struct page *page;
2913bc2b6a7SMuchun Song 	void *to;
2923bc2b6a7SMuchun Song 
2933bc2b6a7SMuchun Song 	BUG_ON(pte_page(*pte) != walk->reuse_page);
2943bc2b6a7SMuchun Song 
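	/*
	 * Take a pre-allocated page off @vmemmap_pages, seed it with a copy
	 * of the reuse page so the tail struct pages start out with valid
	 * contents, then make the pte point at it writably.
	 */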
2953bc2b6a7SMuchun Song 	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
2963bc2b6a7SMuchun Song 	list_del(&page->lru);
2973bc2b6a7SMuchun Song 	to = page_to_virt(page);
2983bc2b6a7SMuchun Song 	copy_page(to, (void *)walk->reuse_addr);
299e7d32485SMuchun Song 	reset_struct_pages(to);
3003bc2b6a7SMuchun Song 
3013bc2b6a7SMuchun Song 	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
3023bc2b6a7SMuchun Song }
3033bc2b6a7SMuchun Song 
304f41f2ed4SMuchun Song /**
305f41f2ed4SMuchun Song  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
306f41f2ed4SMuchun Song  *			to the page which @reuse is mapped to, then free the
307f41f2ed4SMuchun Song  *			vmemmap pages which the range was mapped to.
308f41f2ed4SMuchun Song  * @start:	start address of the vmemmap virtual address range that we want
309f41f2ed4SMuchun Song  *		to remap.
310f41f2ed4SMuchun Song  * @end:	end address of the vmemmap virtual address range that we want to
311f41f2ed4SMuchun Song  *		remap.
312f41f2ed4SMuchun Song  * @reuse:	reuse address.
313f41f2ed4SMuchun Song  *
3143bc2b6a7SMuchun Song  * Return: %0 on success, negative error code otherwise.
315f41f2ed4SMuchun Song  */
3163bc2b6a7SMuchun Song int vmemmap_remap_free(unsigned long start, unsigned long end,
317f41f2ed4SMuchun Song 		       unsigned long reuse)
318f41f2ed4SMuchun Song {
3193bc2b6a7SMuchun Song 	int ret;
320f41f2ed4SMuchun Song 	LIST_HEAD(vmemmap_pages);
321f41f2ed4SMuchun Song 	struct vmemmap_remap_walk walk = {
322f41f2ed4SMuchun Song 		.remap_pte	= vmemmap_remap_pte,
323f41f2ed4SMuchun Song 		.reuse_addr	= reuse,
324f41f2ed4SMuchun Song 		.vmemmap_pages	= &vmemmap_pages,
325f41f2ed4SMuchun Song 	};
326f41f2ed4SMuchun Song 
327f41f2ed4SMuchun Song 	/*
328f41f2ed4SMuchun Song 	 * In order to make the remapping routine most efficient for huge pages,
329f41f2ed4SMuchun Song 	 * the routine of vmemmap page table walking has the following rules
330f41f2ed4SMuchun Song 	 * (see more details from the vmemmap_pte_range()):
331f41f2ed4SMuchun Song 	 *
332f41f2ed4SMuchun Song 	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
333f41f2ed4SMuchun Song 	 *   should be contiguous.
334f41f2ed4SMuchun Song 	 * - The @reuse address is part of the range [@reuse, @end) that we are
335f41f2ed4SMuchun Song 	 *   walking which is passed to vmemmap_remap_range().
336f41f2ed4SMuchun Song 	 * - The @reuse address is the first in the complete range.
337f41f2ed4SMuchun Song 	 *
338f41f2ed4SMuchun Song 	 * So we need to make sure that @start and @reuse meet the above rules.
339f41f2ed4SMuchun Song 	 */
340f41f2ed4SMuchun Song 	BUG_ON(start - reuse != PAGE_SIZE);
341f41f2ed4SMuchun Song 
342d8d55f56SMuchun Song 	mmap_read_lock(&init_mm);
3433bc2b6a7SMuchun Song 	ret = vmemmap_remap_range(reuse, end, &walk);
3443bc2b6a7SMuchun Song 	if (ret && walk.nr_walked) {
3453bc2b6a7SMuchun Song 		end = reuse + walk.nr_walked * PAGE_SIZE;
3463bc2b6a7SMuchun Song 		/*
3473bc2b6a7SMuchun Song 		 * vmemmap_pages contains pages from the previous
3483bc2b6a7SMuchun Song 		 * vmemmap_remap_range call which failed.  These
3493bc2b6a7SMuchun Song 		 * are pages which were removed from the vmemmap.
3503bc2b6a7SMuchun Song 		 * They will be restored in the following call.
3513bc2b6a7SMuchun Song 		 */
3523bc2b6a7SMuchun Song 		walk = (struct vmemmap_remap_walk) {
3533bc2b6a7SMuchun Song 			.remap_pte	= vmemmap_restore_pte,
3543bc2b6a7SMuchun Song 			.reuse_addr	= reuse,
3553bc2b6a7SMuchun Song 			.vmemmap_pages	= &vmemmap_pages,
3563bc2b6a7SMuchun Song 		};
3573bc2b6a7SMuchun Song 
358f41f2ed4SMuchun Song 		vmemmap_remap_range(reuse, end, &walk);
359f41f2ed4SMuchun Song 	}
3603bc2b6a7SMuchun Song 	mmap_read_unlock(&init_mm);
3618f6aac41SChristoph Lameter 
3623bc2b6a7SMuchun Song 	free_vmemmap_page_list(&vmemmap_pages);
363ad2fa371SMuchun Song 
3643bc2b6a7SMuchun Song 	return ret;
365ad2fa371SMuchun Song }
366ad2fa371SMuchun Song 
367ad2fa371SMuchun Song static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
368ad2fa371SMuchun Song 				   gfp_t gfp_mask, struct list_head *list)
369ad2fa371SMuchun Song {
370ad2fa371SMuchun Song 	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
371ad2fa371SMuchun Song 	int nid = page_to_nid((struct page *)start);
372ad2fa371SMuchun Song 	struct page *page, *next;
373ad2fa371SMuchun Song 
374ad2fa371SMuchun Song 	while (nr_pages--) {
375ad2fa371SMuchun Song 		page = alloc_pages_node(nid, gfp_mask, 0);
376ad2fa371SMuchun Song 		if (!page)
377ad2fa371SMuchun Song 			goto out;
378ad2fa371SMuchun Song 		list_add_tail(&page->lru, list);
379ad2fa371SMuchun Song 	}
380ad2fa371SMuchun Song 
381ad2fa371SMuchun Song 	return 0;
382ad2fa371SMuchun Song out:
383ad2fa371SMuchun Song 	list_for_each_entry_safe(page, next, list, lru)
384ad2fa371SMuchun Song 		__free_pages(page, 0);
385ad2fa371SMuchun Song 	return -ENOMEM;
386ad2fa371SMuchun Song }
387ad2fa371SMuchun Song 
388ad2fa371SMuchun Song /**
389ad2fa371SMuchun Song  * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
390ad2fa371SMuchun Song  *			 to the pages which are from the @vmemmap_pages
391ad2fa371SMuchun Song  *			 respectively.
392ad2fa371SMuchun Song  * @start:	start address of the vmemmap virtual address range that we want
393ad2fa371SMuchun Song  *		to remap.
394ad2fa371SMuchun Song  * @end:	end address of the vmemmap virtual address range that we want to
395ad2fa371SMuchun Song  *		remap.
396ad2fa371SMuchun Song  * @reuse:	reuse address.
397ad2fa371SMuchun Song  * @gfp_mask:	GFP flag for allocating vmemmap pages.
3983bc2b6a7SMuchun Song  *
3993bc2b6a7SMuchun Song  * Return: %0 on success, negative error code otherwise.
400ad2fa371SMuchun Song  */
401ad2fa371SMuchun Song int vmemmap_remap_alloc(unsigned long start, unsigned long end,
402ad2fa371SMuchun Song 			unsigned long reuse, gfp_t gfp_mask)
403ad2fa371SMuchun Song {
404ad2fa371SMuchun Song 	LIST_HEAD(vmemmap_pages);
405ad2fa371SMuchun Song 	struct vmemmap_remap_walk walk = {
406ad2fa371SMuchun Song 		.remap_pte	= vmemmap_restore_pte,
407ad2fa371SMuchun Song 		.reuse_addr	= reuse,
408ad2fa371SMuchun Song 		.vmemmap_pages	= &vmemmap_pages,
409ad2fa371SMuchun Song 	};
410ad2fa371SMuchun Song 
411ad2fa371SMuchun Song 	/* See the comment in the vmemmap_remap_free(). */
412ad2fa371SMuchun Song 	BUG_ON(start - reuse != PAGE_SIZE);
413ad2fa371SMuchun Song 
414ad2fa371SMuchun Song 	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
415ad2fa371SMuchun Song 		return -ENOMEM;
416ad2fa371SMuchun Song 
4173bc2b6a7SMuchun Song 	mmap_read_lock(&init_mm);
418ad2fa371SMuchun Song 	vmemmap_remap_range(reuse, end, &walk);
4193bc2b6a7SMuchun Song 	mmap_read_unlock(&init_mm);
420ad2fa371SMuchun Song 
421ad2fa371SMuchun Song 	return 0;
422ad2fa371SMuchun Song }
42347010c04SMuchun Song #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
424ad2fa371SMuchun Song 
4258f6aac41SChristoph Lameter /*
4268f6aac41SChristoph Lameter  * Allocate a block of memory to be used to back the virtual memory map
4278f6aac41SChristoph Lameter  * or to back the page tables that are used to create the mapping.
4288f6aac41SChristoph Lameter  * Uses the main allocators if they are available, else bootmem.
4298f6aac41SChristoph Lameter  */
430e0dc3a53SKAMEZAWA Hiroyuki 
431bd721ea7SFabian Frederick static void * __ref __earlyonly_bootmem_alloc(int node,
432e0dc3a53SKAMEZAWA Hiroyuki 				unsigned long size,
433e0dc3a53SKAMEZAWA Hiroyuki 				unsigned long align,
434e0dc3a53SKAMEZAWA Hiroyuki 				unsigned long goal)
435e0dc3a53SKAMEZAWA Hiroyuki {
436eb31d559SMike Rapoport 	return memblock_alloc_try_nid_raw(size, align, goal,
43797ad1087SMike Rapoport 					       MEMBLOCK_ALLOC_ACCESSIBLE, node);
438e0dc3a53SKAMEZAWA Hiroyuki }
439e0dc3a53SKAMEZAWA Hiroyuki 
4408f6aac41SChristoph Lameter void * __meminit vmemmap_alloc_block(unsigned long size, int node)
4418f6aac41SChristoph Lameter {
4428f6aac41SChristoph Lameter 	/* If the main allocator is up, use that; otherwise fall back to bootmem. */
4438f6aac41SChristoph Lameter 	if (slab_is_available()) {
444fcdaf842SMichal Hocko 		gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
445fcdaf842SMichal Hocko 		int order = get_order(size);
446fcdaf842SMichal Hocko 		static bool warned;
447f52407ceSShaohua Li 		struct page *page;
448f52407ceSShaohua Li 
449fcdaf842SMichal Hocko 		page = alloc_pages_node(node, gfp_mask, order);
4508f6aac41SChristoph Lameter 		if (page)
4518f6aac41SChristoph Lameter 			return page_address(page);
452fcdaf842SMichal Hocko 
453fcdaf842SMichal Hocko 		if (!warned) {
454fcdaf842SMichal Hocko 			warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
455fcdaf842SMichal Hocko 				   "vmemmap alloc failure: order:%u", order);
456fcdaf842SMichal Hocko 			warned = true;
457fcdaf842SMichal Hocko 		}
4588f6aac41SChristoph Lameter 		return NULL;
4598f6aac41SChristoph Lameter 	} else
460e0dc3a53SKAMEZAWA Hiroyuki 		return __earlyonly_bootmem_alloc(node, size, size,
4618f6aac41SChristoph Lameter 				__pa(MAX_DMA_ADDRESS));
4628f6aac41SChristoph Lameter }
4638f6aac41SChristoph Lameter 
46456993b4eSAnshuman Khandual static void * __meminit altmap_alloc_block_buf(unsigned long size,
46556993b4eSAnshuman Khandual 					       struct vmem_altmap *altmap);
4669bdac914SYinghai Lu 
46756993b4eSAnshuman Khandual /* The size needs to be the same for each call during the early stage. */
46856993b4eSAnshuman Khandual void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
46956993b4eSAnshuman Khandual 					 struct vmem_altmap *altmap)
47056993b4eSAnshuman Khandual {
47156993b4eSAnshuman Khandual 	void *ptr;
47256993b4eSAnshuman Khandual 
47356993b4eSAnshuman Khandual 	if (altmap)
47456993b4eSAnshuman Khandual 		return altmap_alloc_block_buf(size, altmap);
47556993b4eSAnshuman Khandual 
47656993b4eSAnshuman Khandual 	ptr = sparse_buffer_alloc(size);
47735fd1eb1SPavel Tatashin 	if (!ptr)
47835fd1eb1SPavel Tatashin 		ptr = vmemmap_alloc_block(size, node);
4799bdac914SYinghai Lu 	return ptr;
4809bdac914SYinghai Lu }
4819bdac914SYinghai Lu 
4824b94ffdcSDan Williams static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
4834b94ffdcSDan Williams {
4844b94ffdcSDan Williams 	return altmap->base_pfn + altmap->reserve + altmap->alloc
4854b94ffdcSDan Williams 		+ altmap->align;
4864b94ffdcSDan Williams }
4874b94ffdcSDan Williams 
4884b94ffdcSDan Williams static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
4894b94ffdcSDan Williams {
4904b94ffdcSDan Williams 	unsigned long allocated = altmap->alloc + altmap->align;
4914b94ffdcSDan Williams 
4924b94ffdcSDan Williams 	if (altmap->free > allocated)
4934b94ffdcSDan Williams 		return altmap->free - allocated;
4944b94ffdcSDan Williams 	return 0;
4954b94ffdcSDan Williams }
4964b94ffdcSDan Williams 
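/*
 * Carve the struct page storage out of the device memory itself: advance
 * the altmap allocation cursor, accounting separately for the pfns consumed
 * by the allocation (->alloc) and by alignment padding (->align).
 */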
49756993b4eSAnshuman Khandual static void * __meminit altmap_alloc_block_buf(unsigned long size,
4984b94ffdcSDan Williams 					       struct vmem_altmap *altmap)
4994b94ffdcSDan Williams {
500eb804533SChristoph Hellwig 	unsigned long pfn, nr_pfns, nr_align;
5014b94ffdcSDan Williams 
5024b94ffdcSDan Williams 	if (size & ~PAGE_MASK) {
5034b94ffdcSDan Williams 		pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
5044b94ffdcSDan Williams 				__func__, size);
5054b94ffdcSDan Williams 		return NULL;
5064b94ffdcSDan Williams 	}
5074b94ffdcSDan Williams 
508eb804533SChristoph Hellwig 	pfn = vmem_altmap_next_pfn(altmap);
5094b94ffdcSDan Williams 	nr_pfns = size >> PAGE_SHIFT;
510eb804533SChristoph Hellwig 	nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
511eb804533SChristoph Hellwig 	nr_align = ALIGN(pfn, nr_align) - pfn;
512eb804533SChristoph Hellwig 	if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
513eb804533SChristoph Hellwig 		return NULL;
514eb804533SChristoph Hellwig 
515eb804533SChristoph Hellwig 	altmap->alloc += nr_pfns;
516eb804533SChristoph Hellwig 	altmap->align += nr_align;
517eb804533SChristoph Hellwig 	pfn += nr_align;
518eb804533SChristoph Hellwig 
5194b94ffdcSDan Williams 	pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
5204b94ffdcSDan Williams 			__func__, pfn, altmap->alloc, altmap->align, nr_pfns);
521eb804533SChristoph Hellwig 	return __va(__pfn_to_phys(pfn));
5224b94ffdcSDan Williams }
5234b94ffdcSDan Williams 
5248f6aac41SChristoph Lameter void __meminit vmemmap_verify(pte_t *pte, int node,
5258f6aac41SChristoph Lameter 				unsigned long start, unsigned long end)
5268f6aac41SChristoph Lameter {
5278f6aac41SChristoph Lameter 	unsigned long pfn = pte_pfn(*pte);
5288f6aac41SChristoph Lameter 	int actual_node = early_pfn_to_nid(pfn);
5298f6aac41SChristoph Lameter 
530b41ad14cSDavid Rientjes 	if (node_distance(actual_node, node) > LOCAL_DISTANCE)
5311170532bSJoe Perches 		pr_warn("[%lx-%lx] potential offnode page_structs\n",
532756a025fSJoe Perches 			start, end - 1);
5338f6aac41SChristoph Lameter }
5348f6aac41SChristoph Lameter 
5351d9cfee7SAnshuman Khandual pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
5364917f55bSJoao Martins 				       struct vmem_altmap *altmap,
5374917f55bSJoao Martins 				       struct page *reuse)
5388f6aac41SChristoph Lameter {
53929c71111SAndy Whitcroft 	pte_t *pte = pte_offset_kernel(pmd, addr);
5408f6aac41SChristoph Lameter 	if (pte_none(*pte)) {
5418f6aac41SChristoph Lameter 		pte_t entry;
5421d9cfee7SAnshuman Khandual 		void *p;
5431d9cfee7SAnshuman Khandual 
5444917f55bSJoao Martins 		if (!reuse) {
54556993b4eSAnshuman Khandual 			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
5468f6aac41SChristoph Lameter 			if (!p)
5479dce07f1SAl Viro 				return NULL;
5484917f55bSJoao Martins 		} else {
5494917f55bSJoao Martins 			/*
5504917f55bSJoao Martins 			 * When a PTE/PMD entry is freed from the init_mm
5514917f55bSJoao Martins 			 * there's a free_pages() call to this page allocated
5524917f55bSJoao Martins 			 * above. Thus this get_page() is paired with the
5534917f55bSJoao Martins 			 * put_page_testzero() on the freeing path.
5544917f55bSJoao Martins 			 * This can only be called by certain ZONE_DEVICE paths,
5554917f55bSJoao Martins 			 * and through vmemmap_populate_compound_pages() when
5564917f55bSJoao Martins 			 * slab is available.
5574917f55bSJoao Martins 			 */
5584917f55bSJoao Martins 			get_page(reuse);
5594917f55bSJoao Martins 			p = page_to_virt(reuse);
5604917f55bSJoao Martins 		}
56129c71111SAndy Whitcroft 		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
56229c71111SAndy Whitcroft 		set_pte_at(&init_mm, addr, pte, entry);
56329c71111SAndy Whitcroft 	}
56429c71111SAndy Whitcroft 	return pte;
5658f6aac41SChristoph Lameter }
5668f6aac41SChristoph Lameter 
567f7f99100SPavel Tatashin static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
568f7f99100SPavel Tatashin {
569f7f99100SPavel Tatashin 	void *p = vmemmap_alloc_block(size, node);
570f7f99100SPavel Tatashin 
571f7f99100SPavel Tatashin 	if (!p)
572f7f99100SPavel Tatashin 		return NULL;
573f7f99100SPavel Tatashin 	memset(p, 0, size);
574f7f99100SPavel Tatashin 
575f7f99100SPavel Tatashin 	return p;
576f7f99100SPavel Tatashin }
577f7f99100SPavel Tatashin 
57829c71111SAndy Whitcroft pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
5798f6aac41SChristoph Lameter {
58029c71111SAndy Whitcroft 	pmd_t *pmd = pmd_offset(pud, addr);
5818f6aac41SChristoph Lameter 	if (pmd_none(*pmd)) {
582f7f99100SPavel Tatashin 		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
5838f6aac41SChristoph Lameter 		if (!p)
5849dce07f1SAl Viro 			return NULL;
5858f6aac41SChristoph Lameter 		pmd_populate_kernel(&init_mm, pmd, p);
5868f6aac41SChristoph Lameter 	}
58729c71111SAndy Whitcroft 	return pmd;
5888f6aac41SChristoph Lameter }
5898f6aac41SChristoph Lameter 
590c2febafcSKirill A. Shutemov pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
5918f6aac41SChristoph Lameter {
592c2febafcSKirill A. Shutemov 	pud_t *pud = pud_offset(p4d, addr);
5938f6aac41SChristoph Lameter 	if (pud_none(*pud)) {
594f7f99100SPavel Tatashin 		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
5958f6aac41SChristoph Lameter 		if (!p)
5969dce07f1SAl Viro 			return NULL;
5978f6aac41SChristoph Lameter 		pud_populate(&init_mm, pud, p);
5988f6aac41SChristoph Lameter 	}
59929c71111SAndy Whitcroft 	return pud;
6008f6aac41SChristoph Lameter }
6018f6aac41SChristoph Lameter 
602c2febafcSKirill A. Shutemov p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
603c2febafcSKirill A. Shutemov {
604c2febafcSKirill A. Shutemov 	p4d_t *p4d = p4d_offset(pgd, addr);
605c2febafcSKirill A. Shutemov 	if (p4d_none(*p4d)) {
606f7f99100SPavel Tatashin 		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
607c2febafcSKirill A. Shutemov 		if (!p)
608c2febafcSKirill A. Shutemov 			return NULL;
609c2febafcSKirill A. Shutemov 		p4d_populate(&init_mm, p4d, p);
610c2febafcSKirill A. Shutemov 	}
611c2febafcSKirill A. Shutemov 	return p4d;
612c2febafcSKirill A. Shutemov }
613c2febafcSKirill A. Shutemov 
61429c71111SAndy Whitcroft pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
6158f6aac41SChristoph Lameter {
61629c71111SAndy Whitcroft 	pgd_t *pgd = pgd_offset_k(addr);
6178f6aac41SChristoph Lameter 	if (pgd_none(*pgd)) {
618f7f99100SPavel Tatashin 		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
6198f6aac41SChristoph Lameter 		if (!p)
6209dce07f1SAl Viro 			return NULL;
6218f6aac41SChristoph Lameter 		pgd_populate(&init_mm, pgd, p);
6228f6aac41SChristoph Lameter 	}
62329c71111SAndy Whitcroft 	return pgd;
6248f6aac41SChristoph Lameter }
62529c71111SAndy Whitcroft 
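/*
 * Instantiate the page table chain (pgd -> p4d -> pud -> pmd -> pte)
 * backing a single vmemmap page at @addr, allocating intermediate tables
 * as needed, and return the PTE that maps it.
 */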
6262beea70aSJoao Martins static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
6274917f55bSJoao Martins 					      struct vmem_altmap *altmap,
6284917f55bSJoao Martins 					      struct page *reuse)
62929c71111SAndy Whitcroft {
63029c71111SAndy Whitcroft 	pgd_t *pgd;
631c2febafcSKirill A. Shutemov 	p4d_t *p4d;
63229c71111SAndy Whitcroft 	pud_t *pud;
63329c71111SAndy Whitcroft 	pmd_t *pmd;
63429c71111SAndy Whitcroft 	pte_t *pte;
63529c71111SAndy Whitcroft 
63629c71111SAndy Whitcroft 	pgd = vmemmap_pgd_populate(addr, node);
63729c71111SAndy Whitcroft 	if (!pgd)
6382beea70aSJoao Martins 		return NULL;
639c2febafcSKirill A. Shutemov 	p4d = vmemmap_p4d_populate(pgd, addr, node);
640c2febafcSKirill A. Shutemov 	if (!p4d)
6412beea70aSJoao Martins 		return NULL;
642c2febafcSKirill A. Shutemov 	pud = vmemmap_pud_populate(p4d, addr, node);
64329c71111SAndy Whitcroft 	if (!pud)
6442beea70aSJoao Martins 		return NULL;
64529c71111SAndy Whitcroft 	pmd = vmemmap_pmd_populate(pud, addr, node);
64629c71111SAndy Whitcroft 	if (!pmd)
6472beea70aSJoao Martins 		return NULL;
6484917f55bSJoao Martins 	pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
64929c71111SAndy Whitcroft 	if (!pte)
6502beea70aSJoao Martins 		return NULL;
65129c71111SAndy Whitcroft 	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
6522beea70aSJoao Martins 
6532beea70aSJoao Martins 	return pte;
6542beea70aSJoao Martins }
6552beea70aSJoao Martins 
6562beea70aSJoao Martins static int __meminit vmemmap_populate_range(unsigned long start,
6572beea70aSJoao Martins 					    unsigned long end, int node,
6584917f55bSJoao Martins 					    struct vmem_altmap *altmap,
6594917f55bSJoao Martins 					    struct page *reuse)
6602beea70aSJoao Martins {
6612beea70aSJoao Martins 	unsigned long addr = start;
6622beea70aSJoao Martins 	pte_t *pte;
6632beea70aSJoao Martins 
6642beea70aSJoao Martins 	for (; addr < end; addr += PAGE_SIZE) {
6654917f55bSJoao Martins 		pte = vmemmap_populate_address(addr, node, altmap, reuse);
6662beea70aSJoao Martins 		if (!pte)
6672beea70aSJoao Martins 			return -ENOMEM;
6688f6aac41SChristoph Lameter 	}
66929c71111SAndy Whitcroft 
67029c71111SAndy Whitcroft 	return 0;
67129c71111SAndy Whitcroft }
6728f6aac41SChristoph Lameter 
6732beea70aSJoao Martins int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
6742beea70aSJoao Martins 					 int node, struct vmem_altmap *altmap)
6752beea70aSJoao Martins {
6764917f55bSJoao Martins 	return vmemmap_populate_range(start, end, node, altmap, NULL);
6774917f55bSJoao Martins }
6784917f55bSJoao Martins 
6794917f55bSJoao Martins /*
6804917f55bSJoao Martins  * For compound pages bigger than section size (e.g. x86 1G compound
6814917f55bSJoao Martins  * For compound pages bigger than section size (e.g. x86 1G compound
6824917f55bSJoao Martins  * pages with 2M subsection size), fill the rest of the sections as tail
6834917f55bSJoao Martins  * pages.
6844917f55bSJoao Martins  * Note that memremap_pages() resets @nr_range value and will increment
6854917f55bSJoao Martins  * it after each successful range onlining. Thus the value of @nr_range
6864917f55bSJoao Martins  * at section memmap populate time corresponds to the in-progress range
6874917f55bSJoao Martins  * being onlined here.
6884917f55bSJoao Martins  */
6894917f55bSJoao Martins static bool __meminit reuse_compound_section(unsigned long start_pfn,
6904917f55bSJoao Martins 					     struct dev_pagemap *pgmap)
6914917f55bSJoao Martins {
6924917f55bSJoao Martins 	unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
6934917f55bSJoao Martins 	unsigned long offset = start_pfn -
6944917f55bSJoao Martins 		PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
6954917f55bSJoao Martins 
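	/*
	 * Tail struct pages from an earlier section can only be reused when
	 * the compound page is larger than a subsection and this section
	 * does not itself start a new compound page.
	 */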
6964917f55bSJoao Martins 	return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
6974917f55bSJoao Martins }
6984917f55bSJoao Martins 
6994917f55bSJoao Martins static pte_t * __meminit compound_section_tail_page(unsigned long addr)
7004917f55bSJoao Martins {
7014917f55bSJoao Martins 	pte_t *pte;
7024917f55bSJoao Martins 
7034917f55bSJoao Martins 	addr -= PAGE_SIZE;
7044917f55bSJoao Martins 
7054917f55bSJoao Martins 	/*
7064917f55bSJoao Martins 	 * Assuming sections are populated sequentially, the previous section's
7074917f55bSJoao Martins 	 * page data can be reused.
7084917f55bSJoao Martins 	 */
7094917f55bSJoao Martins 	pte = pte_offset_kernel(pmd_off_k(addr), addr);
7104917f55bSJoao Martins 	if (!pte)
7114917f55bSJoao Martins 		return NULL;
7124917f55bSJoao Martins 
7134917f55bSJoao Martins 	return pte;
7144917f55bSJoao Martins }
7154917f55bSJoao Martins 
7164917f55bSJoao Martins static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
7174917f55bSJoao Martins 						     unsigned long start,
7184917f55bSJoao Martins 						     unsigned long end, int node,
7194917f55bSJoao Martins 						     struct dev_pagemap *pgmap)
7204917f55bSJoao Martins {
7214917f55bSJoao Martins 	unsigned long size, addr;
7224917f55bSJoao Martins 	pte_t *pte;
7234917f55bSJoao Martins 	int rc;
7244917f55bSJoao Martins 
7254917f55bSJoao Martins 	if (reuse_compound_section(start_pfn, pgmap)) {
7264917f55bSJoao Martins 		pte = compound_section_tail_page(start);
7274917f55bSJoao Martins 		if (!pte)
7284917f55bSJoao Martins 			return -ENOMEM;
7294917f55bSJoao Martins 
7304917f55bSJoao Martins 		/*
7314917f55bSJoao Martins 		 * Reuse the page that was populated in the prior iteration
7324917f55bSJoao Martins 		 * with just tail struct pages.
7334917f55bSJoao Martins 		 */
7344917f55bSJoao Martins 		return vmemmap_populate_range(start, end, node, NULL,
7354917f55bSJoao Martins 					      pte_page(*pte));
7364917f55bSJoao Martins 	}
7374917f55bSJoao Martins 
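	/*
	 * Each iteration covers the struct pages of one compound page
	 * (capped to the range being populated): one head vmemmap page,
	 * one vmemmap page of tail struct pages, and the remaining vmemmap
	 * pages all remapped to that single tail page.
	 */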
7384917f55bSJoao Martins 	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
7394917f55bSJoao Martins 	for (addr = start; addr < end; addr += size) {
740*55896f93SGautam Menghani 		unsigned long next, last = addr + size;
7414917f55bSJoao Martins 
7424917f55bSJoao Martins 		/* Populate the head page vmemmap page */
7434917f55bSJoao Martins 		pte = vmemmap_populate_address(addr, node, NULL, NULL);
7444917f55bSJoao Martins 		if (!pte)
7454917f55bSJoao Martins 			return -ENOMEM;
7464917f55bSJoao Martins 
7474917f55bSJoao Martins 		/* Populate the tail pages vmemmap page */
7484917f55bSJoao Martins 		next = addr + PAGE_SIZE;
7494917f55bSJoao Martins 		pte = vmemmap_populate_address(next, node, NULL, NULL);
7504917f55bSJoao Martins 		if (!pte)
7514917f55bSJoao Martins 			return -ENOMEM;
7524917f55bSJoao Martins 
7534917f55bSJoao Martins 		/*
7544917f55bSJoao Martins 		 * Reuse the previous page for the rest of the tail pages
7554917f55bSJoao Martins 		 * See layout diagram in Documentation/vm/vmemmap_dedup.rst
7564917f55bSJoao Martins 		 */
7574917f55bSJoao Martins 		next += PAGE_SIZE;
7584917f55bSJoao Martins 		rc = vmemmap_populate_range(next, last, node, NULL,
7594917f55bSJoao Martins 					    pte_page(*pte));
7604917f55bSJoao Martins 		if (rc)
7614917f55bSJoao Martins 			return -ENOMEM;
7624917f55bSJoao Martins 	}
7634917f55bSJoao Martins 
7644917f55bSJoao Martins 	return 0;
7652beea70aSJoao Martins }
7662beea70aSJoao Martins 
767e9c0a3f0SDan Williams struct page * __meminit __populate_section_memmap(unsigned long pfn,
768e3246d8fSJoao Martins 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
769e3246d8fSJoao Martins 		struct dev_pagemap *pgmap)
7708f6aac41SChristoph Lameter {
7716cda7204SWei Yang 	unsigned long start = (unsigned long) pfn_to_page(pfn);
7726cda7204SWei Yang 	unsigned long end = start + nr_pages * sizeof(struct page);
7734917f55bSJoao Martins 	int r;
7740aad818bSJohannes Weiner 
7756cda7204SWei Yang 	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
7766cda7204SWei Yang 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
7776cda7204SWei Yang 		return NULL;
7780aad818bSJohannes Weiner 
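	/*
	 * The compound page tail deduplication is only attempted when
	 * sizeof(struct page) is a power of 2 (so a compound page's struct
	 * pages line up with whole vmemmap pages), the pagemap actually
	 * requests compound pages, and no altmap is in use.
	 */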
7794917f55bSJoao Martins 	if (is_power_of_2(sizeof(struct page)) &&
7804917f55bSJoao Martins 	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
7814917f55bSJoao Martins 		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
7824917f55bSJoao Martins 	else
7834917f55bSJoao Martins 		r = vmemmap_populate(start, end, nid, altmap);
7844917f55bSJoao Martins 
7854917f55bSJoao Martins 	if (r < 0)
7868f6aac41SChristoph Lameter 		return NULL;
7878f6aac41SChristoph Lameter 
788e9c0a3f0SDan Williams 	return pfn_to_page(pfn);
7898f6aac41SChristoph Lameter }
790