xref: /linux/mm/sparse.c (revision c717993dd76a1049093af5c262e751d901b8da10)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * sparse memory mappings.
4   */
5  #include <linux/mm.h>
6  #include <linux/slab.h>
7  #include <linux/mmzone.h>
8  #include <linux/memblock.h>
9  #include <linux/compiler.h>
10  #include <linux/highmem.h>
11  #include <linux/export.h>
12  #include <linux/spinlock.h>
13  #include <linux/vmalloc.h>
14  #include <linux/swap.h>
15  #include <linux/swapops.h>
16  #include <linux/bootmem_info.h>
17  
18  #include "internal.h"
19  #include <asm/dma.h>
20  
21  /*
22   * Permanent SPARSEMEM data:
23   *
24   * 1) mem_section	- memory sections, mem_maps for valid memory
25   */
26  #ifdef CONFIG_SPARSEMEM_EXTREME
27  struct mem_section **mem_section;
28  #else
29  struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
30  	____cacheline_internodealigned_in_smp;
31  #endif
32  EXPORT_SYMBOL(mem_section);
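/*
 * Worked example (editor's sketch; figures assume a typical x86-64
 * configuration with SECTION_SIZE_BITS = 27 and MAX_PHYSMEM_BITS = 46):
 * NR_MEM_SECTIONS = 1 << (46 - 27) = 524288 sections of 128 MiB each.
 * With SPARSEMEM_EXTREME only the NR_SECTION_ROOTS root pointers are
 * static; each root of SECTIONS_PER_ROOT mem_section entries is
 * allocated on demand by sparse_index_init() below. Without EXTREME,
 * the full two-dimensional array above is allocated statically.
 */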
33  
34  #ifdef NODE_NOT_IN_PAGE_FLAGS
35  /*
36   * If we did not store the node number in the page then we have to
37   * do a lookup in the section_to_node_table in order to find which
38   * node the page belongs to.
39   */
40  #if MAX_NUMNODES <= 256
41  static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
42  #else
43  static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
44  #endif
45  
46  int page_to_nid(const struct page *page)
47  {
48  	return section_to_node_table[page_to_section(page)];
49  }
50  EXPORT_SYMBOL(page_to_nid);
51  
52  static void set_section_nid(unsigned long section_nr, int nid)
53  {
54  	section_to_node_table[section_nr] = nid;
55  }
56  #else /* !NODE_NOT_IN_PAGE_FLAGS */
57  static inline void set_section_nid(unsigned long section_nr, int nid)
58  {
59  }
60  #endif
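/*
 * Example (editor's note): NODE_NOT_IN_PAGE_FLAGS is set when the
 * section, zone and node ids together do not fit into page->flags
 * (e.g. many NUMA nodes combined with a large section count on 32-bit
 * builds). page_to_nid() above then costs one extra lookup in
 * section_to_node_table[], keyed by the section number that is still
 * stored in page->flags, instead of reading the node id directly from
 * the flags.
 */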
61  
62  #ifdef CONFIG_SPARSEMEM_EXTREME
63  static noinline struct mem_section __ref *sparse_index_alloc(int nid)
64  {
65  	struct mem_section *section = NULL;
66  	unsigned long array_size = SECTIONS_PER_ROOT *
67  				   sizeof(struct mem_section);
68  
69  	if (slab_is_available()) {
70  		section = kzalloc_node(array_size, GFP_KERNEL, nid);
71  	} else {
72  		section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
73  					      nid);
74  		if (!section)
75  			panic("%s: Failed to allocate %lu bytes nid=%d\n",
76  			      __func__, array_size, nid);
77  	}
78  
79  	return section;
80  }
81  
82  static int __meminit sparse_index_init(unsigned long section_nr, int nid)
83  {
84  	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
85  	struct mem_section *section;
86  
87  	/*
88  	 * An existing section is possible in the sub-section hotplug
89  	 * case. First hot-add instantiates, follow-on hot-add reuses
90  	 * the existing section.
91  	 *
92  	 * The mem_hotplug_lock resolves the apparent race below.
93  	 */
94  	if (mem_section[root])
95  		return 0;
96  
97  	section = sparse_index_alloc(nid);
98  	if (!section)
99  		return -ENOMEM;
100  
101  	mem_section[root] = section;
102  
103  	return 0;
104  }
105  #else /* !SPARSEMEM_EXTREME */
106  static inline int sparse_index_init(unsigned long section_nr, int nid)
107  {
108  	return 0;
109  }
110  #endif
111  
112  /*
113   * During early boot, before section_mem_map is used for an actual
114   * mem_map, we use section_mem_map to store the section's NUMA
115   * node.  This keeps us from having to use another data structure.  The
116   * node information is cleared just before we store the real mem_map.
117   */
118  static inline unsigned long sparse_encode_early_nid(int nid)
119  {
120  	return ((unsigned long)nid << SECTION_NID_SHIFT);
121  }
122  
123  static inline int sparse_early_nid(struct mem_section *section)
124  {
125  	return (section->section_mem_map >> SECTION_NID_SHIFT);
126  }
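/*
 * Worked example (editor's illustration): during early boot,
 * memory_present() stores sparse_encode_early_nid(nid), i.e. the node
 * id shifted up by SECTION_NID_SHIFT, in section_mem_map (together
 * with SECTION_IS_ONLINE). sparse_early_nid() just shifts it back
 * down. sparse_init_one_section() later clears those high bits before
 * installing the encoded mem_map, so the early node id is only valid
 * until the real mem_map is set up.
 */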
127  
128  /* Validate the physical addressing limitations of the model */
129  void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
130  						unsigned long *end_pfn)
131  {
132  	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
133  
134  	/*
135  	 * Sanity checks - do not allow an architecture to pass
136  	 * in larger pfns than the maximum scope of sparsemem:
137  	 */
138  	if (*start_pfn > max_sparsemem_pfn) {
139  		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
140  			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
141  			*start_pfn, *end_pfn, max_sparsemem_pfn);
142  		WARN_ON_ONCE(1);
143  		*start_pfn = max_sparsemem_pfn;
144  		*end_pfn = max_sparsemem_pfn;
145  	} else if (*end_pfn > max_sparsemem_pfn) {
146  		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
147  			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
148  			*start_pfn, *end_pfn, max_sparsemem_pfn);
149  		WARN_ON_ONCE(1);
150  		*end_pfn = max_sparsemem_pfn;
151  	}
152  }
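/*
 * Worked example (editor's sketch; assumes MAX_PHYSMEM_BITS = 46 and
 * PAGE_SHIFT = 12 as on a common x86-64 build): max_sparsemem_pfn is
 * 1UL << (46 - 12) = 2^34 pfns, i.e. 64 TiB of addressable physical
 * memory. A range whose end exceeds that limit is clipped; a range
 * that starts beyond it is discarded entirely (start == end == limit
 * after the adjustment above).
 */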
153  
154  /*
155   * There are a number of times that we loop over NR_MEM_SECTIONS,
156   * looking for section_present() on each.  But, when we have very
157   * large physical address spaces, NR_MEM_SECTIONS can also be
158   * very large, which makes the loops quite long.
159   *
160   * Keeping track of this gives us an easy way to break out of
161   * those loops early.
162   */
163  unsigned long __highest_present_section_nr;
164  static void __section_mark_present(struct mem_section *ms,
165  		unsigned long section_nr)
166  {
167  	if (section_nr > __highest_present_section_nr)
168  		__highest_present_section_nr = section_nr;
169  
170  	ms->section_mem_map |= SECTION_MARKED_PRESENT;
171  }
172  
173  #define for_each_present_section_nr(start, section_nr)		\
174  	for (section_nr = next_present_section_nr(start-1);	\
175  	     ((section_nr != -1) &&				\
176  	      (section_nr <= __highest_present_section_nr));	\
177  	     section_nr = next_present_section_nr(section_nr))
178  
179  static inline unsigned long first_present_section_nr(void)
180  {
181  	return next_present_section_nr(-1);
182  }
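/*
 * Usage sketch (editor's illustration): callers walk only the present
 * sections, e.g.
 *
 *	unsigned long pnum;
 *
 *	for_each_present_section_nr(0, pnum)
 *		do_something(__nr_to_section(pnum));
 *
 * where do_something() stands in for the caller's per-section work.
 * The walk stops as soon as pnum passes __highest_present_section_nr,
 * so a sparsely populated physical address space does not force a
 * scan of all NR_MEM_SECTIONS entries.
 */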
183  
184  #ifdef CONFIG_SPARSEMEM_VMEMMAP
185  static void subsection_mask_set(unsigned long *map, unsigned long pfn,
186  		unsigned long nr_pages)
187  {
188  	int idx = subsection_map_index(pfn);
189  	int end = subsection_map_index(pfn + nr_pages - 1);
190  
191  	bitmap_set(map, idx, end - idx + 1);
192  }
193  
194  void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
195  {
196  	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
197  	unsigned long nr, start_sec = pfn_to_section_nr(pfn);
198  
199  	if (!nr_pages)
200  		return;
201  
202  	for (nr = start_sec; nr <= end_sec; nr++) {
203  		struct mem_section *ms;
204  		unsigned long pfns;
205  
206  		pfns = min(nr_pages, PAGES_PER_SECTION
207  				- (pfn & ~PAGE_SECTION_MASK));
208  		ms = __nr_to_section(nr);
209  		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
210  
211  		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
212  				pfns, subsection_map_index(pfn),
213  				subsection_map_index(pfn + pfns - 1));
214  
215  		pfn += pfns;
216  		nr_pages -= pfns;
217  	}
218  }
219  #else
220  void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
221  {
222  }
223  #endif
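/*
 * Worked example (editor's sketch for the CONFIG_SPARSEMEM_VMEMMAP
 * variant above; assumes 128 MiB sections, 4 KiB pages and 2 MiB
 * subsections, i.e. SUBSECTIONS_PER_SECTION = 64): registering the
 * pfn range [0x8000, 0x18000) covers exactly sections 1 and 2, so the
 * loop sets all 64 bits of the subsection_map for each of those
 * sections. A range that covers only part of a section sets just the
 * subsection bits it actually spans.
 */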
224  
225  /* Record a memory area against a node. */
226  static void __init memory_present(int nid, unsigned long start, unsigned long end)
227  {
228  	unsigned long pfn;
229  
230  #ifdef CONFIG_SPARSEMEM_EXTREME
231  	if (unlikely(!mem_section)) {
232  		unsigned long size, align;
233  
234  		size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
235  		align = 1 << (INTERNODE_CACHE_SHIFT);
236  		mem_section = memblock_alloc(size, align);
237  		if (!mem_section)
238  			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
239  			      __func__, size, align);
240  	}
241  #endif
242  
243  	start &= PAGE_SECTION_MASK;
244  	mminit_validate_memmodel_limits(&start, &end);
245  	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
246  		unsigned long section = pfn_to_section_nr(pfn);
247  		struct mem_section *ms;
248  
249  		sparse_index_init(section, nid);
250  		set_section_nid(section, nid);
251  
252  		ms = __nr_to_section(section);
253  		if (!ms->section_mem_map) {
254  			ms->section_mem_map = sparse_encode_early_nid(nid) |
255  							SECTION_IS_ONLINE;
256  			__section_mark_present(ms, section);
257  		}
258  	}
259  }
260  
261  /*
262   * Mark all memblocks as present using memory_present().
263   * This is a convenience function that is useful to mark all of the system's
264   * memory as present during initialization.
265   */
266  static void __init memblocks_present(void)
267  {
268  	unsigned long start, end;
269  	int i, nid;
270  
271  	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
272  		memory_present(nid, start, end);
273  }
274  
275  /*
276   * Subtle: we encode the real pfn into the mem_map such that
277   * the identity pfn - section_mem_map will return the actual
278   * physical page frame number.
279   */
280  static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
281  {
282  	unsigned long coded_mem_map =
283  		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
284  	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
285  	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
286  	return coded_mem_map;
287  }
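/*
 * Worked example (editor's illustration): if section pnum starts at
 * pfn S and its struct page array lives at mem_map, then
 * coded_mem_map = mem_map - S. pfn_to_page(pfn) for any pfn in that
 * section is then simply coded_mem_map + pfn, and
 * sparse_decode_mem_map() below recovers mem_map by masking off the
 * flag bits kept in the low bits and adding S back.
 */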
288  
289  #ifdef CONFIG_MEMORY_HOTPLUG
290  /*
291   * Decode mem_map from the coded memmap
292   */
293  struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
294  {
295  	/* mask off the extra low bits of information */
296  	coded_mem_map &= SECTION_MAP_MASK;
297  	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
298  }
299  #endif /* CONFIG_MEMORY_HOTPLUG */
300  
301  static void __meminit sparse_init_one_section(struct mem_section *ms,
302  		unsigned long pnum, struct page *mem_map,
303  		struct mem_section_usage *usage, unsigned long flags)
304  {
305  	ms->section_mem_map &= ~SECTION_MAP_MASK;
306  	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
307  		| SECTION_HAS_MEM_MAP | flags;
308  	ms->usage = usage;
309  }
310  
311  static unsigned long usemap_size(void)
312  {
313  	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
314  }
315  
316  size_t mem_section_usage_size(void)
317  {
318  	return sizeof(struct mem_section_usage) + usemap_size();
319  }
320  
321  static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
322  {
323  #ifndef CONFIG_NUMA
324  	VM_BUG_ON(pgdat != &contig_page_data);
325  	return __pa_symbol(&contig_page_data);
326  #else
327  	return __pa(pgdat);
328  #endif
329  }
330  
331  #ifdef CONFIG_MEMORY_HOTREMOVE
332  static struct mem_section_usage * __init
333  sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
334  					 unsigned long size)
335  {
336  	struct mem_section_usage *usage;
337  	unsigned long goal, limit;
338  	int nid;
339  	/*
340  	 * A page may contain usemaps for other sections, preventing the
341  	 * page from being freed and making a section unremovable while
342  	 * other sections referencing the usemap remain active. Similarly,
343  	 * a pgdat can prevent a section being removed. If section A
344  	 * contains a pgdat and section B contains the usemap, both
345  	 * sections become inter-dependent. This allocates usemaps
346  	 * from the same section as the pgdat where possible to avoid
347  	 * this problem.
348  	 */
349  	goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
350  	limit = goal + (1UL << PA_SECTION_SHIFT);
351  	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
352  again:
353  	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
354  	if (!usage && limit) {
355  		limit = 0;
356  		goto again;
357  	}
358  	return usage;
359  }
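/*
 * Placement sketch (editor's note): "goal" is the physical address of
 * the start of the section that holds the pgdat, and "limit" is one
 * full section beyond it, so the first attempt confines the usemap to
 * the pgdat's own section. Only if that allocation fails is the upper
 * bound dropped (limit = 0) and any memory on the node accepted.
 */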
360  
361  static void __init check_usemap_section_nr(int nid,
362  		struct mem_section_usage *usage)
363  {
364  	unsigned long usemap_snr, pgdat_snr;
365  	static unsigned long old_usemap_snr;
366  	static unsigned long old_pgdat_snr;
367  	struct pglist_data *pgdat = NODE_DATA(nid);
368  	int usemap_nid;
369  
370  	/* First call */
371  	if (!old_usemap_snr) {
372  		old_usemap_snr = NR_MEM_SECTIONS;
373  		old_pgdat_snr = NR_MEM_SECTIONS;
374  	}
375  
376  	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
377  	pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
378  	if (usemap_snr == pgdat_snr)
379  		return;
380  
381  	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
382  		/* skip redundant message */
383  		return;
384  
385  	old_usemap_snr = usemap_snr;
386  	old_pgdat_snr = pgdat_snr;
387  
388  	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
389  	if (usemap_nid != nid) {
390  		pr_info("node %d must be removed before remove section %ld\n",
391  			nid, usemap_snr);
392  		return;
393  	}
394  	/*
395  	 * There is a circular dependency.
396  	 * Some platforms allow un-removable sections because they will just
397  	 * gather other removable sections for dynamic partitioning.
398  	 * Just report the un-removable section's number here.
399  	 */
400  	pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
401  		usemap_snr, pgdat_snr, nid);
402  }
403  #else
404  static struct mem_section_usage * __init
405  sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
406  					 unsigned long size)
407  {
408  	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
409  }
410  
411  static void __init check_usemap_section_nr(int nid,
412  		struct mem_section_usage *usage)
413  {
414  }
415  #endif /* CONFIG_MEMORY_HOTREMOVE */
416  
417  #ifdef CONFIG_SPARSEMEM_VMEMMAP
418  static unsigned long __init section_map_size(void)
419  {
420  	return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
421  }
422  
423  #else
424  static unsigned long __init section_map_size(void)
425  {
426  	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
427  }
428  
429  struct page __init *__populate_section_memmap(unsigned long pfn,
430  		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
431  {
432  	unsigned long size = section_map_size();
433  	struct page *map = sparse_buffer_alloc(size);
434  	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
435  
436  	if (map)
437  		return map;
438  
439  	map = memmap_alloc(size, size, addr, nid, false);
440  	if (!map)
441  		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
442  		      __func__, size, PAGE_SIZE, nid, &addr);
443  
444  	return map;
445  }
446  #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
447  
448  static void *sparsemap_buf __meminitdata;
449  static void *sparsemap_buf_end __meminitdata;
450  
451  static inline void __meminit sparse_buffer_free(unsigned long size)
452  {
453  	WARN_ON(!sparsemap_buf || size == 0);
454  	memblock_free(sparsemap_buf, size);
455  }
456  
457  static void __init sparse_buffer_init(unsigned long size, int nid)
458  {
459  	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
460  	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
461  	/*
462  	 * The pre-allocated buffer is mainly used by __populate_section_memmap
463  	 * and we want it to be properly aligned to the section size - this is
464  	 * especially the case for VMEMMAP, which maps the memmap to PMDs.
465  	 */
466  	sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
467  	sparsemap_buf_end = sparsemap_buf + size;
468  }
469  
470  static void __init sparse_buffer_fini(void)
471  {
472  	unsigned long size = sparsemap_buf_end - sparsemap_buf;
473  
474  	if (sparsemap_buf && size > 0)
475  		sparse_buffer_free(size);
476  	sparsemap_buf = NULL;
477  }
478  
479  void * __meminit sparse_buffer_alloc(unsigned long size)
480  {
481  	void *ptr = NULL;
482  
483  	if (sparsemap_buf) {
484  		ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
485  		if (ptr + size > sparsemap_buf_end)
486  			ptr = NULL;
487  		else {
488  			/* Give the space skipped for alignment back to memblock */
489  			if ((unsigned long)(ptr - sparsemap_buf) > 0)
490  				sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
491  			sparsemap_buf = ptr + size;
492  		}
493  	}
494  	return ptr;
495  }
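/*
 * Worked example (editor's sketch with made-up numbers): if
 * sparsemap_buf points just past a previous allocation and the next
 * request is a 2 MiB section memmap, ptr is rounded up to the next
 * 2 MiB boundary, the alignment gap between the old sparsemap_buf and
 * ptr is handed back to memblock via sparse_buffer_free(), and
 * sparsemap_buf advances past the returned chunk. If the rounded-up
 * request no longer fits below sparsemap_buf_end, NULL is returned
 * and the caller falls back to a regular memblock allocation.
 */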
496  
497  void __weak __meminit vmemmap_populate_print_last(void)
498  {
499  }
500  
501  /*
502   * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
503   * and the number of present sections in this node is map_count.
504   */
505  static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
506  				   unsigned long pnum_end,
507  				   unsigned long map_count)
508  {
509  	struct mem_section_usage *usage;
510  	unsigned long pnum;
511  	struct page *map;
512  
513  	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
514  			mem_section_usage_size() * map_count);
515  	if (!usage) {
516  		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
517  		goto failed;
518  	}
519  	sparse_buffer_init(map_count * section_map_size(), nid);
520  	for_each_present_section_nr(pnum_begin, pnum) {
521  		unsigned long pfn = section_nr_to_pfn(pnum);
522  
523  		if (pnum >= pnum_end)
524  			break;
525  
526  		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
527  				nid, NULL);
528  		if (!map) {
529  			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
530  			       __func__, nid);
531  			pnum_begin = pnum;
532  			sparse_buffer_fini();
533  			goto failed;
534  		}
535  		check_usemap_section_nr(nid, usage);
536  		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
537  				SECTION_IS_EARLY);
538  		usage = (void *) usage + mem_section_usage_size();
539  	}
540  	sparse_buffer_fini();
541  	return;
542  failed:
543  	/* We failed to allocate; mark all the following pnums as not present */
544  	for_each_present_section_nr(pnum_begin, pnum) {
545  		struct mem_section *ms;
546  
547  		if (pnum >= pnum_end)
548  			break;
549  		ms = __nr_to_section(pnum);
550  		ms->section_mem_map = 0;
551  	}
552  }
553  
554  /*
555   * Allocate the accumulated non-linear sections, allocate a mem_map
556   * for each and record the physical to section mapping.
557   */
558  void __init sparse_init(void)
559  {
560  	unsigned long pnum_end, pnum_begin, map_count = 1;
561  	int nid_begin;
562  
563  	memblocks_present();
564  
565  	pnum_begin = first_present_section_nr();
566  	nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
567  
568  	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
569  	set_pageblock_order();
570  
571  	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
572  		int nid = sparse_early_nid(__nr_to_section(pnum_end));
573  
574  		if (nid == nid_begin) {
575  			map_count++;
576  			continue;
577  		}
578  		/* Init node with sections in range [pnum_begin, pnum_end) */
579  		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
580  		nid_begin = nid;
581  		pnum_begin = pnum_end;
582  		map_count = 1;
583  	}
584  	/* cover the last node */
585  	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
586  	vmemmap_populate_print_last();
587  }
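/*
 * Flow sketch (editor's illustration): if sections 0-3 are present on
 * node 0 and sections 4-5 on node 1, the loop above notices the node
 * change at section 4 and calls sparse_init_nid(0, 0, 4, 4); the
 * final call after the loop then covers node 1 (pnum_begin = 4,
 * map_count = 2), with pnum_end already advanced past the last
 * present section by the iterator.
 */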
588  
589  #ifdef CONFIG_MEMORY_HOTPLUG
590  
591  /* Mark all memory sections within the pfn range as online */
592  void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
593  {
594  	unsigned long pfn;
595  
596  	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
597  		unsigned long section_nr = pfn_to_section_nr(pfn);
598  		struct mem_section *ms;
599  
600  		/* onlining code should never touch invalid ranges */
601  		if (WARN_ON(!valid_section_nr(section_nr)))
602  			continue;
603  
604  		ms = __nr_to_section(section_nr);
605  		ms->section_mem_map |= SECTION_IS_ONLINE;
606  	}
607  }
608  
609  /* Mark all memory sections within the pfn range as offline */
610  void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
611  {
612  	unsigned long pfn;
613  
614  	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
615  		unsigned long section_nr = pfn_to_section_nr(pfn);
616  		struct mem_section *ms;
617  
618  		/*
619  		 * TODO: this needs some double checking. Offlining code makes
620  		 * sure to check pfn_valid, but those checks might just be bogus.
621  		 */
622  		if (WARN_ON(!valid_section_nr(section_nr)))
623  			continue;
624  
625  		ms = __nr_to_section(section_nr);
626  		ms->section_mem_map &= ~SECTION_IS_ONLINE;
627  	}
628  }
629  
630  #ifdef CONFIG_SPARSEMEM_VMEMMAP
631  static struct page * __meminit populate_section_memmap(unsigned long pfn,
632  		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
633  {
634  	return __populate_section_memmap(pfn, nr_pages, nid, altmap);
635  }
636  
637  static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
638  		struct vmem_altmap *altmap)
639  {
640  	unsigned long start = (unsigned long) pfn_to_page(pfn);
641  	unsigned long end = start + nr_pages * sizeof(struct page);
642  
643  	vmemmap_free(start, end, altmap);
644  }
645  static void free_map_bootmem(struct page *memmap)
646  {
647  	unsigned long start = (unsigned long)memmap;
648  	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
649  
650  	vmemmap_free(start, end, NULL);
651  }
652  
653  static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
654  {
655  	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
656  	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
657  	struct mem_section *ms = __pfn_to_section(pfn);
658  	unsigned long *subsection_map = ms->usage
659  		? &ms->usage->subsection_map[0] : NULL;
660  
661  	subsection_mask_set(map, pfn, nr_pages);
662  	if (subsection_map)
663  		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
664  
665  	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
666  				"section already deactivated (%#lx + %ld)\n",
667  				pfn, nr_pages))
668  		return -EINVAL;
669  
670  	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
671  	return 0;
672  }
673  
674  static bool is_subsection_map_empty(struct mem_section *ms)
675  {
676  	return bitmap_empty(&ms->usage->subsection_map[0],
677  			    SUBSECTIONS_PER_SECTION);
678  }
679  
680  static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
681  {
682  	struct mem_section *ms = __pfn_to_section(pfn);
683  	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
684  	unsigned long *subsection_map;
685  	int rc = 0;
686  
687  	subsection_mask_set(map, pfn, nr_pages);
688  
689  	subsection_map = &ms->usage->subsection_map[0];
690  
691  	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
692  		rc = -EINVAL;
693  	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
694  		rc = -EEXIST;
695  	else
696  		bitmap_or(subsection_map, map, subsection_map,
697  				SUBSECTIONS_PER_SECTION);
698  
699  	return rc;
700  }
701  #else
702  struct page * __meminit populate_section_memmap(unsigned long pfn,
703  		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
704  {
705  	return kvmalloc_node(array_size(sizeof(struct page),
706  					PAGES_PER_SECTION), GFP_KERNEL, nid);
707  }
708  
709  static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
710  		struct vmem_altmap *altmap)
711  {
712  	kvfree(pfn_to_page(pfn));
713  }
714  
715  static void free_map_bootmem(struct page *memmap)
716  {
717  	unsigned long maps_section_nr, removing_section_nr, i;
718  	unsigned long magic, nr_pages;
719  	struct page *page = virt_to_page(memmap);
720  
721  	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
722  		>> PAGE_SHIFT;
723  
724  	for (i = 0; i < nr_pages; i++, page++) {
725  		magic = page->index;
726  
727  		BUG_ON(magic == NODE_INFO);
728  
729  		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
730  		removing_section_nr = page_private(page);
731  
732  		/*
733  		 * When this function is called, the section being removed is in a
734  		 * logically offlined state. This means all of its pages are isolated
735  		 * from the page allocator. If the removed section's memmap is placed
736  		 * in the same section, it must not be freed.
737  		 * If it were freed, the page allocator might allocate it again, even
738  		 * though it will be removed physically soon.
739  		 */
740  		if (maps_section_nr != removing_section_nr)
741  			put_page_bootmem(page);
742  	}
743  }
744  
745  static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
746  {
747  	return 0;
748  }
749  
750  static bool is_subsection_map_empty(struct mem_section *ms)
751  {
752  	return true;
753  }
754  
755  static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
756  {
757  	return 0;
758  }
759  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
760  
761  /*
762   * To deactivate a memory region, there are 3 cases to handle across
763   * two configurations (SPARSEMEM_VMEMMAP={y,n}):
764   *
765   * 1. deactivation of a partial hot-added section (only possible in
766   *    the SPARSEMEM_VMEMMAP=y case).
767   *      a) section was present at memory init.
768   *      b) section was hot-added post memory init.
769   * 2. deactivation of a complete hot-added section.
770   * 3. deactivation of a complete section from memory init.
771   *
772   * For 1, when the subsection_map is not empty we will not be freeing the
773   * usage map, but we still need to free the vmemmap range.
774   *
775   * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified.
776   */
777  static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
778  		struct vmem_altmap *altmap)
779  {
780  	struct mem_section *ms = __pfn_to_section(pfn);
781  	bool section_is_early = early_section(ms);
782  	struct page *memmap = NULL;
783  	bool empty;
784  
785  	if (clear_subsection_map(pfn, nr_pages))
786  		return;
787  
788  	empty = is_subsection_map_empty(ms);
789  	if (empty) {
790  		unsigned long section_nr = pfn_to_section_nr(pfn);
791  
792  		/*
793  		 * When removing an early section, the usage map is kept (as the
794  		 * usage maps of other sections fall into the same page). It
795  		 * will be re-used when re-adding the section - which is then no
796  		 * longer an early section. If the usage map is PageReserved, it
797  		 * was allocated during boot.
798  		 */
799  		if (!PageReserved(virt_to_page(ms->usage))) {
800  			kfree(ms->usage);
801  			ms->usage = NULL;
802  		}
803  		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
804  		/*
805  		 * Mark the section invalid so that valid_section()
806  		 * returns false. This prevents code from dereferencing
807  		 * the ms->usage array.
808  		 */
809  		ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
810  	}
811  
812  	/*
813  	 * The memmap of early sections is always fully populated. See
814  	 * section_activate() and pfn_valid() .
815  	 */
816  	if (!section_is_early)
817  		depopulate_section_memmap(pfn, nr_pages, altmap);
818  	else if (memmap)
819  		free_map_bootmem(memmap);
820  
821  	if (empty)
822  		ms->section_mem_map = (unsigned long)NULL;
823  }
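/*
 * Case walk-through (editor's note): removing one 2 MiB subsection of
 * a hot-added 128 MiB section (case 1b above) leaves the
 * subsection_map non-empty, so only that subsection's memmap range is
 * depopulated and the usage map survives. Removing a complete
 * hot-added section (case 2) finds the map empty after clearing: the
 * kzalloc'd usage is freed, the section's memmap is depopulated and
 * section_mem_map is finally set back to zero.
 */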
824  
825  static struct page * __meminit section_activate(int nid, unsigned long pfn,
826  		unsigned long nr_pages, struct vmem_altmap *altmap)
827  {
828  	struct mem_section *ms = __pfn_to_section(pfn);
829  	struct mem_section_usage *usage = NULL;
830  	struct page *memmap;
831  	int rc = 0;
832  
833  	if (!ms->usage) {
834  		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
835  		if (!usage)
836  			return ERR_PTR(-ENOMEM);
837  		ms->usage = usage;
838  	}
839  
840  	rc = fill_subsection_map(pfn, nr_pages);
841  	if (rc) {
842  		if (usage)
843  			ms->usage = NULL;
844  		kfree(usage);
845  		return ERR_PTR(rc);
846  	}
847  
848  	/*
849  	 * The early init code does not consider partially populated
850  	 * initial sections, it simply assumes that memory will never be
851  	 * referenced.  If we hot-add memory into such a section then we
852  	 * do not need to populate the memmap and can simply reuse what
853  	 * is already there.
854  	 */
855  	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
856  		return pfn_to_page(pfn);
857  
858  	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
859  	if (!memmap) {
860  		section_deactivate(pfn, nr_pages, altmap);
861  		return ERR_PTR(-ENOMEM);
862  	}
863  
864  	return memmap;
865  }
866  
867  /**
868   * sparse_add_section - add a memory section, or populate an existing one
869   * @nid: The node to add section on
870   * @start_pfn: start pfn of the memory range
871   * @nr_pages: number of pfns to add in the section
872   * @altmap: device page map
873   *
874   * This is only intended for hotplug.
875   *
876   * Note that only VMEMMAP supports sub-section aligned hotplug;
877   * the proper alignment and size are gated by check_pfn_span().
878   *
879   *
880   * Return:
881   * * 0		- On success.
882   * * -EEXIST	- Section is already present.
883   * * -ENOMEM	- Out of memory.
884   */
885  int __meminit sparse_add_section(int nid, unsigned long start_pfn,
886  		unsigned long nr_pages, struct vmem_altmap *altmap)
887  {
888  	unsigned long section_nr = pfn_to_section_nr(start_pfn);
889  	struct mem_section *ms;
890  	struct page *memmap;
891  	int ret;
892  
893  	ret = sparse_index_init(section_nr, nid);
894  	if (ret < 0)
895  		return ret;
896  
897  	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
898  	if (IS_ERR(memmap))
899  		return PTR_ERR(memmap);
900  
901  	/*
902  	 * Poison uninitialized struct pages in order to catch invalid flags
903  	 * combinations.
904  	 */
905  	page_init_poison(memmap, sizeof(struct page) * nr_pages);
906  
907  	ms = __nr_to_section(section_nr);
908  	set_section_nid(section_nr, nid);
909  	__section_mark_present(ms, section_nr);
910  
911  	/* Align memmap to section boundary in the subsection case */
912  	if (section_nr_to_pfn(section_nr) != start_pfn)
913  		memmap = pfn_to_page(section_nr_to_pfn(section_nr));
914  	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
915  
916  	return 0;
917  }
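/*
 * Usage sketch (editor's note): the memory hotplug core calls this one
 * section at a time, e.g. __add_pages() walks the hot-added range and
 * invokes sparse_add_section() for each section-sized (or smaller,
 * subsection-aligned) piece. With SPARSEMEM_VMEMMAP and a device
 * altmap the piece may be a 2 MiB subsection rather than a full
 * section, which is why memmap is realigned to the section start
 * above before being encoded into section_mem_map.
 */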
918  
919  #ifdef CONFIG_MEMORY_FAILURE
920  static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
921  {
922  	int i;
923  
924  	/*
925  	 * A further optimization is to have per section refcounted
926  	 * num_poisoned_pages.  But that would need more space per memmap, so
927  	 * for now just do a quick global check to speed up this routine in the
928  	 * absence of bad pages.
929  	 */
930  	if (atomic_long_read(&num_poisoned_pages) == 0)
931  		return;
932  
933  	for (i = 0; i < nr_pages; i++) {
934  		if (PageHWPoison(&memmap[i])) {
935  			num_poisoned_pages_dec();
936  			ClearPageHWPoison(&memmap[i]);
937  		}
938  	}
939  }
940  #else
941  static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
942  {
943  }
944  #endif
945  
946  void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
947  		unsigned long nr_pages, unsigned long map_offset,
948  		struct vmem_altmap *altmap)
949  {
950  	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
951  			nr_pages - map_offset);
952  	section_deactivate(pfn, nr_pages, altmap);
953  }
954  #endif /* CONFIG_MEMORY_HOTPLUG */
955