xref: /linux/mm/sparse-vmemmap.c (revision fcdaf842bd8f538a88059ce0243bc2822ed1b0e0)
// SPDX-License-Identifier: GPL-2.0
/*
 * Virtual Memory Map support
 *
 * (C) 2007 sgi. Christoph Lameter.
 *
 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
 * virt_to_page, page_address() to be implemented as a base offset
 * calculation without memory access.
 *
 * However, virtual mappings need a page table and TLBs. Many Linux
 * architectures already map their physical space using 1-1 mappings
 * via TLBs. For those arches the virtual memory map is essentially
 * for free if we use the same page size as the 1-1 mappings. In that
 * case the overhead consists of a few additional pages that are
 * allocated to create a view of memory for vmemmap.
 *
 * The architecture is expected to provide a vmemmap_populate() function
 * to instantiate the mapping.
 */
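/*
 * For illustration only: with a virtually contiguous memmap, the generic
 * pfn/page helpers reduce to pointer arithmetic against an arch-chosen
 * virtual base (roughly what the sparsemem-vmemmap memory model does,
 * where vmemmap is the virtual start of the struct page array):
 *
 *	#define __pfn_to_page(pfn)	(vmemmap + (pfn))
 *	#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
 */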
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>

/*
 * Allocate a block of memory to be used to back the virtual memory map
 * or to back the page tables that are used to create the mapping.
 * Uses the main allocators if they are available, else bootmem.
 */
static void * __ref __earlyonly_bootmem_alloc(int node,
				unsigned long size,
				unsigned long align,
				unsigned long goal)
{
	return memblock_virt_alloc_try_nid_raw(size, align, goal,
					    BOOTMEM_ALLOC_ACCESSIBLE, node);
}

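/*
 * Early-boot scratch buffer: carved out per node in
 * sparse_mem_maps_populate_node() and handed out in aligned chunks by
 * alloc_block_buf(); reset to NULL once population of the node finishes.
 */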
static void *vmemmap_buf;
static void *vmemmap_buf_end;

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
	/* If the main allocator is up, use that; fall back to bootmem. */
	if (slab_is_available()) {
		gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
		int order = get_order(size);
		static bool warned;
		struct page *page;

		page = alloc_pages_node(node, gfp_mask, order);
		if (page)
			return page_address(page);

		if (!warned) {
			warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
				   "vmemmap alloc failure: order:%u", order);
			warned = true;
		}
		return NULL;
	} else
		return __earlyonly_bootmem_alloc(node, size, size,
				__pa(MAX_DMA_ADDRESS));
}

/* Callers must use the same size for every allocation during the early stage. */
static void * __meminit alloc_block_buf(unsigned long size, int node)
{
	void *ptr;

	if (!vmemmap_buf)
		return vmemmap_alloc_block(size, node);

	/* take the allocation from the preallocated buffer */
	ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
	if (ptr + size > vmemmap_buf_end)
		return vmemmap_alloc_block(size, node);

	vmemmap_buf = ptr + size;

	return ptr;
}

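/* First pfn in the altmap reservation that has not been handed out yet. */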
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
	return altmap->base_pfn + altmap->reserve + altmap->alloc
		+ altmap->align;
}

static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
	unsigned long allocated = altmap->alloc + altmap->align;

	if (altmap->free > allocated)
		return altmap->free - allocated;
	return 0;
}

/**
 * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation
 * @altmap: reserved page pool for the allocation
 * @nr_pfns: size (in pages) of the allocation
 *
 * Allocations are aligned to the size of the request.
 */
static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap,
		unsigned long nr_pfns)
{
	unsigned long pfn = vmem_altmap_next_pfn(altmap);
	unsigned long nr_align;

	nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
	nr_align = ALIGN(pfn, nr_align) - pfn;

	if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
		return ULONG_MAX;
	altmap->alloc += nr_pfns;
	altmap->align += nr_align;
	return pfn + nr_align;
}
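/*
 * Worked example (illustrative): a PMD_SIZE request on x86-64 means
 * nr_pfns = 512, so nr_align starts as 1 << find_first_bit(&nr_pfns,
 * BITS_PER_LONG) = 512 and the returned pfn is rounded up to a 512-page
 * (2MB) boundary, keeping the block naturally aligned for a huge mapping.
 */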

static void * __meminit altmap_alloc_block_buf(unsigned long size,
		struct vmem_altmap *altmap)
{
	unsigned long pfn, nr_pfns;
	void *ptr;

	if (size & ~PAGE_MASK) {
		pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
				__func__, size);
		return NULL;
	}

	nr_pfns = size >> PAGE_SHIFT;
	pfn = vmem_altmap_alloc(altmap, nr_pfns);
	if (pfn < ULONG_MAX)
		ptr = __va(__pfn_to_phys(pfn));
	else
		ptr = NULL;
	pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
			__func__, pfn, altmap->alloc, altmap->align, nr_pfns);

	return ptr;
}

/* Callers must use the same size for every allocation during the early stage. */
void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
		struct vmem_altmap *altmap)
{
	if (altmap)
		return altmap_alloc_block_buf(size, altmap);
	return alloc_block_buf(size, node);
}
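/*
 * Hedged usage sketch (hypothetical caller, not part of this file): an
 * architecture's vmemmap_populate() typically tries a PMD-sized block
 * first and falls back to base pages, along the lines of:
 *
 *	void *p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
 *
 *	if (p)
 *		set_pmd(pmd, __pmd(__pa(p) | pgprot_val(PAGE_KERNEL_LARGE)));
 *	else if (vmemmap_populate_basepages(addr, next, node))
 *		return -ENOMEM;
 *
 * The pmd/addr/next names and PAGE_KERNEL_LARGE are x86-style placeholders;
 * the exact helpers and page protections are per-architecture.
 */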

void __meminit vmemmap_verify(pte_t *pte, int node,
				unsigned long start, unsigned long end)
{
	unsigned long pfn = pte_pfn(*pte);
	int actual_node = early_pfn_to_nid(pfn);

	if (node_distance(actual_node, node) > LOCAL_DISTANCE)
		pr_warn("[%lx-%lx] potential offnode page_structs\n",
			start, end - 1);
}

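/*
 * Map a single vmemmap page at addr if the PTE is still empty; the
 * backing page comes from the early buffer when one is available.
 */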
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte)) {
		pte_t entry;
		void *p = alloc_block_buf(PAGE_SIZE, node);
		if (!p)
			return NULL;
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
	}
	return pte;
}

static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
{
	void *p = vmemmap_alloc_block(size, node);

	if (!p)
		return NULL;
	memset(p, 0, size);

	return p;
}

pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pmd_populate_kernel(&init_mm, pmd, p);
	}
	return pmd;
}

pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
	pud_t *pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pud_populate(&init_mm, pud, p);
	}
	return pud;
}

p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
	p4d_t *p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		p4d_populate(&init_mm, p4d, p);
	}
	return p4d;
}

pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
	pgd_t *pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pgd_populate(&init_mm, pgd, p);
	}
	return pgd;
}

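/*
 * Populate the vmemmap range [start, end) with base (PAGE_SIZE) pages,
 * building each missing level of the kernel page tables along the way.
 */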
int __meminit vmemmap_populate_basepages(unsigned long start,
					 unsigned long end, int node)
{
	unsigned long addr = start;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (; addr < end; addr += PAGE_SIZE) {
		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
		p4d = vmemmap_p4d_populate(pgd, addr, node);
		if (!p4d)
			return -ENOMEM;
		pud = vmemmap_pud_populate(p4d, addr, node);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_populate(pud, addr, node);
		if (!pmd)
			return -ENOMEM;
		pte = vmemmap_pte_populate(pmd, addr, node);
		if (!pte)
			return -ENOMEM;
		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
	}

	return 0;
}

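/*
 * Build the virtually mapped memmap for one memory section and return the
 * struct page array for that section, or NULL if the architecture's
 * vmemmap_populate() fails.
 */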
struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
{
	unsigned long start;
	unsigned long end;
	struct page *map;

	map = pfn_to_page(pnum * PAGES_PER_SECTION);
	start = (unsigned long)map;
	end = (unsigned long)(map + PAGES_PER_SECTION);

	if (vmemmap_populate(start, end, nid))
		return NULL;

	return map;
}

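/*
 * Populate the memmap for a range of sections on one node. A single
 * PMD-aligned bootmem buffer sized for all map_count sections is
 * preallocated so that section maps can be carved out contiguously;
 * whatever is left over is returned to memblock at the end.
 */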
void __init sparse_mem_maps_populate_node(struct page **map_map,
					  unsigned long pnum_begin,
					  unsigned long pnum_end,
					  unsigned long map_count, int nodeid)
{
	unsigned long pnum;
	unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
	void *vmemmap_buf_start;

	size = ALIGN(size, PMD_SIZE);
	vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
			 PMD_SIZE, __pa(MAX_DMA_ADDRESS));

	if (vmemmap_buf_start) {
		vmemmap_buf = vmemmap_buf_start;
		vmemmap_buf_end = vmemmap_buf_start + size * map_count;
	}

	for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
		struct mem_section *ms;

		if (!present_section_nr(pnum))
			continue;

		map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
		if (map_map[pnum])
			continue;
		ms = __nr_to_section(pnum);
		pr_err("%s: sparsemem memory map backing failed, some memory will not be available\n",
		       __func__);
		ms->section_mem_map = 0;
	}

	if (vmemmap_buf_start) {
		/* free the unused tail of the preallocated buffer */
		memblock_free_early(__pa(vmemmap_buf),
				    vmemmap_buf_end - vmemmap_buf);
		vmemmap_buf = NULL;
		vmemmap_buf_end = NULL;
	}
}