xref: /linux/arch/x86/mm/numa.c (revision 86941382508850d58c11bdafe0fec646dfd31b09)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Common code for 32 and 64-bit NUMA */
3 #include <linux/acpi.h>
4 #include <linux/kernel.h>
5 #include <linux/mm.h>
6 #include <linux/of.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/memblock.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/nodemask.h>
13 #include <linux/sched.h>
14 #include <linux/topology.h>
15 #include <linux/sort.h>
16 #include <linux/numa_memblks.h>
17 
18 #include <asm/e820/api.h>
19 #include <asm/proto.h>
20 #include <asm/dma.h>
21 #include <asm/numa.h>
22 #include <asm/amd/nb.h>
23 
24 #include "mm_internal.h"
25 
/*
 * Set to 1 by numa_setup() when the "numa=" boot option starts with "off".
 * x86_numa_init() then skips every detection method and goes straight to
 * the dummy single-node setup.
 */
int numa_off;
27 
28 static __init int numa_setup(char *opt)
29 {
30 	if (!opt)
31 		return -EINVAL;
32 	if (!strncmp(opt, "off", 3))
33 		numa_off = 1;
34 	if (!strncmp(opt, "fake=", 5))
35 		return numa_emu_cmdline(opt + 5);
36 	if (!strncmp(opt, "noacpi", 6))
37 		disable_srat();
38 	if (!strncmp(opt, "nohmat", 6))
39 		disable_hmat();
40 	return 0;
41 }
42 early_param("numa", numa_setup);
43 
/*
 * apicid, cpu, node mappings
 *
 * __apicid_to_node[] is indexed by APIC ID and holds the node id for that
 * APIC ID.  Every entry is statically initialised to NUMA_NO_NODE.
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
50 
51 int numa_cpu_node(int cpu)
52 {
53 	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
54 
55 	if (apicid != BAD_APICID)
56 		return __apicid_to_node[apicid];
57 	return NUMA_NO_NODE;
58 }
59 
/*
 * One CPU mask per node, indexed by node id.  The entries are allocated by
 * setup_node_to_cpumask_map().
 */
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
62 
/*
 * Map cpu index to node index.
 *
 * Defined as an early per-cpu variable with NUMA_NO_NODE as its initial
 * value, so it can be accessed both before and after the per-cpu areas
 * exist (see numa_set_node() and early_cpu_to_node()).
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
68 
69 void numa_set_node(int cpu, int node)
70 {
71 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
72 
73 	/* early setting, no percpu area yet */
74 	if (cpu_to_node_map) {
75 		cpu_to_node_map[cpu] = node;
76 		return;
77 	}
78 
79 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
80 	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
81 		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
82 		dump_stack();
83 		return;
84 	}
85 #endif
86 	per_cpu(x86_cpu_to_node_map, cpu) = node;
87 
88 	set_cpu_numa_node(cpu, node);
89 }
90 
/*
 * Drop the node assignment of @cpu by setting it to NUMA_NO_NODE through
 * numa_set_node().
 */
void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}
95 
96 /*
97  * Allocate node_to_cpumask_map based on number of available nodes
98  * Requires node_possible_map to be valid.
99  *
100  * Note: cpumask_of_node() is not valid until after this is done.
101  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
102  */
103 void __init setup_node_to_cpumask_map(void)
104 {
105 	unsigned int node;
106 
107 	/* setup nr_node_ids if not done yet */
108 	if (nr_node_ids == MAX_NUMNODES)
109 		setup_nr_node_ids();
110 
111 	/* allocate the map */
112 	for (node = 0; node < nr_node_ids; node++)
113 		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
114 
115 	/* cpumask_of_node() will now work */
116 	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
117 }
118 
119 static int __init numa_register_nodes(void)
120 {
121 	int nid;
122 
123 	if (!memblock_validate_numa_coverage(SZ_1M))
124 		return -EINVAL;
125 
126 	/* Finally register nodes. */
127 	for_each_node_mask(nid, node_possible_map) {
128 		unsigned long start_pfn, end_pfn;
129 
130 		/*
131 		 * Note, get_pfn_range_for_nid() depends on
132 		 * memblock_set_node() having already happened
133 		 */
134 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
135 		if (start_pfn >= end_pfn)
136 			continue;
137 
138 		alloc_node_data(nid);
139 		node_set_online(nid);
140 	}
141 
142 	/* Dump memblock with node info and return. */
143 	memblock_dump_all();
144 	return 0;
145 }
146 
147 /*
148  * There are unfortunately some poorly designed mainboards around that
149  * only connect memory to a single CPU. This breaks the 1:1 cpu->node
150  * mapping. To avoid this fill in the mapping for all possible CPUs,
151  * as the number of CPUs is not known yet. We round robin the existing
152  * nodes.
153  */
154 static void __init numa_init_array(void)
155 {
156 	int rr, i;
157 
158 	rr = first_node(node_online_map);
159 	for (i = 0; i < nr_cpu_ids; i++) {
160 		if (early_cpu_to_node(i) != NUMA_NO_NODE)
161 			continue;
162 		numa_set_node(i, rr);
163 		rr = next_node_in(rr, node_online_map);
164 	}
165 }
166 
167 static int __init numa_init(int (*init_func)(void))
168 {
169 	int i;
170 	int ret;
171 
172 	for (i = 0; i < MAX_LOCAL_APIC; i++)
173 		set_apicid_to_node(i, NUMA_NO_NODE);
174 
175 	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
176 	if (ret < 0)
177 		return ret;
178 
179 	ret = numa_register_nodes();
180 	if (ret < 0)
181 		return ret;
182 
183 	for (i = 0; i < nr_cpu_ids; i++) {
184 		int nid = early_cpu_to_node(i);
185 
186 		if (nid == NUMA_NO_NODE)
187 			continue;
188 		if (!node_online(nid))
189 			numa_clear_node(i);
190 	}
191 	numa_init_array();
192 
193 	return 0;
194 }
195 
196 /**
197  * dummy_numa_init - Fallback dummy NUMA init
198  *
199  * Used if there's no underlying NUMA architecture, NUMA initialization
200  * fails, or NUMA is disabled on the command line.
201  *
202  * Must online at least one node and add memory blocks that cover all
203  * allowed memory.  This function must not fail.
204  */
205 static int __init dummy_numa_init(void)
206 {
207 	printk(KERN_INFO "%s\n",
208 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
209 	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
210 	       0LLU, PFN_PHYS(max_pfn) - 1);
211 
212 	node_set(0, numa_nodes_parsed);
213 	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
214 
215 	return 0;
216 }
217 
218 /**
219  * x86_numa_init - Initialize NUMA
220  *
221  * Try each configured NUMA initialization method until one succeeds.  The
222  * last fallback is dummy single node config encompassing whole memory and
223  * never fails.
224  */
225 void __init x86_numa_init(void)
226 {
227 	if (!numa_off) {
228 #ifdef CONFIG_ACPI_NUMA
229 		if (!numa_init(x86_acpi_numa_init))
230 			return;
231 #endif
232 #ifdef CONFIG_AMD_NUMA
233 		if (!numa_init(amd_numa_init))
234 			return;
235 #endif
236 		if (acpi_disabled && !numa_init(of_numa_init))
237 			return;
238 	}
239 
240 	numa_init(dummy_numa_init);
241 }
242 
243 
244 /*
245  * A node may exist which has one or more Generic Initiators but no CPUs and no
246  * memory.
247  *
248  * This function must be called after init_cpu_to_node(), to ensure that any
249  * memoryless CPU nodes have already been brought online, and before the
250  * node_data[nid] is needed for zone list setup in build_all_zonelists().
251  *
252  * When this function is called, any nodes containing either memory and/or CPUs
253  * will already be online and there is no need to do anything extra, even if
254  * they also contain one or more Generic Initiators.
255  */
256 void __init init_gi_nodes(void)
257 {
258 	int nid;
259 
260 	/*
261 	 * Exclude this node from
262 	 * bringup_nonboot_cpus
263 	 *  cpu_up
264 	 *   __try_online_node
265 	 *    register_one_node
266 	 * because node_subsys is not initialized yet.
267 	 * TODO remove dependency on node_online
268 	 */
269 	for_each_node_state(nid, N_GENERIC_INITIATOR)
270 		if (!node_online(nid))
271 			node_set_online(nid);
272 }
273 
274 /*
275  * Setup early cpu_to_node.
276  *
277  * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
278  * and apicid_to_node[] tables have valid entries for a CPU.
279  * This means we skip cpu_to_node[] initialisation for NUMA
280  * emulation and faking node case (when running a kernel compiled
281  * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
282  * is already initialized in a round robin manner at numa_init_array,
283  * prior to this call, and this initialization is good enough
284  * for the fake NUMA cases.
285  *
286  * Called before the per_cpu areas are setup.
287  */
288 void __init init_cpu_to_node(void)
289 {
290 	int cpu;
291 	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
292 
293 	BUG_ON(cpu_to_apicid == NULL);
294 
295 	for_each_possible_cpu(cpu) {
296 		int node = numa_cpu_node(cpu);
297 
298 		if (node == NUMA_NO_NODE)
299 			continue;
300 
301 		/*
302 		 * Exclude this node from
303 		 * bringup_nonboot_cpus
304 		 *  cpu_up
305 		 *   __try_online_node
306 		 *    register_one_node
307 		 * because node_subsys is not initialized yet.
308 		 * TODO remove dependency on node_online
309 		 */
310 		if (!node_online(node))
311 			node_set_online(node);
312 
313 		numa_set_node(cpu, node);
314 	}
315 }
316 
317 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
318 
319 # ifndef CONFIG_NUMA_EMU
320 void numa_add_cpu(unsigned int cpu)
321 {
322 	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
323 }
324 
325 void numa_remove_cpu(unsigned int cpu)
326 {
327 	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
328 }
329 # endif	/* !CONFIG_NUMA_EMU */
330 
331 #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
332 
333 int __cpu_to_node(int cpu)
334 {
335 	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
336 		printk(KERN_WARNING
337 			"cpu_to_node(%d): usage too early!\n", cpu);
338 		dump_stack();
339 		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
340 	}
341 	return per_cpu(x86_cpu_to_node_map, cpu);
342 }
343 EXPORT_SYMBOL(__cpu_to_node);
344 
345 /*
346  * Same function as cpu_to_node() but used if called before the
347  * per_cpu areas are setup.
348  */
349 int early_cpu_to_node(int cpu)
350 {
351 	if (early_per_cpu_ptr(x86_cpu_to_node_map))
352 		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
353 
354 	if (!cpu_possible(cpu)) {
355 		printk(KERN_WARNING
356 			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
357 		dump_stack();
358 		return NUMA_NO_NODE;
359 	}
360 	return per_cpu(x86_cpu_to_node_map, cpu);
361 }
362 
363 void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
364 {
365 	struct cpumask *mask;
366 
367 	if (node == NUMA_NO_NODE) {
368 		/* early_cpu_to_node() already emits a warning and trace */
369 		return;
370 	}
371 	mask = node_to_cpumask_map[node];
372 	if (!cpumask_available(mask)) {
373 		pr_err("node_to_cpumask_map[%i] NULL\n", node);
374 		dump_stack();
375 		return;
376 	}
377 
378 	if (enable)
379 		cpumask_set_cpu(cpu, mask);
380 	else
381 		cpumask_clear_cpu(cpu, mask);
382 
383 	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
384 		enable ? "numa_add_cpu" : "numa_remove_cpu",
385 		cpu, node, cpumask_pr_args(mask));
386 	return;
387 }
388 
389 # ifndef CONFIG_NUMA_EMU
390 static void numa_set_cpumask(int cpu, bool enable)
391 {
392 	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
393 }
394 
/*
 * Debug-maps variant: add @cpu to the cpumask of its node by calling
 * numa_set_cpumask() with enable set to true.
 */
void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}
399 
/*
 * Debug-maps variant: remove @cpu from the cpumask of its node by calling
 * numa_set_cpumask() with enable set to false.
 */
void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
404 # endif	/* !CONFIG_NUMA_EMU */
405 
406 /*
407  * Returns a pointer to the bitmask of CPUs on Node 'node'.
408  */
409 const struct cpumask *cpumask_of_node(int node)
410 {
411 	if ((unsigned)node >= nr_node_ids) {
412 		printk(KERN_WARNING
413 			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
414 			node, nr_node_ids);
415 		dump_stack();
416 		return cpu_none_mask;
417 	}
418 	if (!cpumask_available(node_to_cpumask_map[node])) {
419 		printk(KERN_WARNING
420 			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
421 			node);
422 		dump_stack();
423 		return cpu_online_mask;
424 	}
425 	return node_to_cpumask_map[node];
426 }
427 EXPORT_SYMBOL(cpumask_of_node);
428 
429 #endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
430 
431 #ifdef CONFIG_NUMA_EMU
432 void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
433 					unsigned int nr_emu_nids)
434 {
435 	int i, j;
436 
437 	/*
438 	 * Transform __apicid_to_node table to use emulated nids by
439 	 * reverse-mapping phys_nid.  The maps should always exist but fall
440 	 * back to zero just in case.
441 	 */
442 	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
443 		if (__apicid_to_node[i] == NUMA_NO_NODE)
444 			continue;
445 		for (j = 0; j < nr_emu_nids; j++)
446 			if (__apicid_to_node[i] == emu_nid_to_phys[j])
447 				break;
448 		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
449 	}
450 }
451 
/*
 * Returns the physical address that corresponds to MAX_DMA32_PFN.
 * NOTE(review): presumably consumed by the generic NUMA emulation code as
 * the end of the DMA-capable range - confirm against the caller.
 */
u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
456 #endif /* CONFIG_NUMA_EMU */
457