xref: /linux/arch/x86/mm/numa.c (revision c23719abc3308df7ed3ad35650ad211fb2d2003d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Common code for 32 and 64-bit NUMA */
3 #include <linux/acpi.h>
4 #include <linux/kernel.h>
5 #include <linux/mm.h>
6 #include <linux/of.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/memblock.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/nodemask.h>
13 #include <linux/sched.h>
14 #include <linux/topology.h>
15 #include <linux/sort.h>
16 #include <linux/numa_memblks.h>
17 
18 #include <asm/e820/api.h>
19 #include <asm/proto.h>
20 #include <asm/dma.h>
21 #include <asm/numa.h>
22 #include <asm/amd/nb.h>
23 
24 #include "mm_internal.h"
25 
26 int numa_off;
27 
numa_setup(char * opt)28 static __init int numa_setup(char *opt)
29 {
30 	if (!opt)
31 		return -EINVAL;
32 	if (!strncmp(opt, "off", 3))
33 		numa_off = 1;
34 	if (!strncmp(opt, "fake=", 5))
35 		return numa_emu_cmdline(opt + 5);
36 	if (!strncmp(opt, "noacpi", 6))
37 		disable_srat();
38 	if (!strncmp(opt, "nohmat", 6))
39 		disable_hmat();
40 	return 0;
41 }
42 early_param("numa", numa_setup);
43 
44 /*
45  * apicid, cpu, node mappings
46  */
47 s16 __apicid_to_node[MAX_LOCAL_APIC] = {
48 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
49 };
50 
51 nodemask_t numa_phys_nodes_parsed __initdata;
52 
numa_cpu_node(int cpu)53 int numa_cpu_node(int cpu)
54 {
55 	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
56 
57 	if (apicid != BAD_APICID)
58 		return __apicid_to_node[apicid];
59 	return NUMA_NO_NODE;
60 }
61 
num_phys_nodes(void)62 int __init num_phys_nodes(void)
63 {
64 	return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
65 }
66 
67 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
68 EXPORT_SYMBOL(node_to_cpumask_map);
69 
70 /*
71  * Map cpu index to node index
72  */
73 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
74 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
75 
numa_set_node(int cpu,int node)76 void numa_set_node(int cpu, int node)
77 {
78 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
79 
80 	/* early setting, no percpu area yet */
81 	if (cpu_to_node_map) {
82 		cpu_to_node_map[cpu] = node;
83 		return;
84 	}
85 
86 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
87 	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
88 		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
89 		dump_stack();
90 		return;
91 	}
92 #endif
93 	per_cpu(x86_cpu_to_node_map, cpu) = node;
94 
95 	set_cpu_numa_node(cpu, node);
96 }
97 
numa_clear_node(int cpu)98 void numa_clear_node(int cpu)
99 {
100 	numa_set_node(cpu, NUMA_NO_NODE);
101 }
102 
103 /*
104  * Allocate node_to_cpumask_map based on number of available nodes
105  * Requires node_possible_map to be valid.
106  *
107  * Note: cpumask_of_node() is not valid until after this is done.
108  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
109  */
setup_node_to_cpumask_map(void)110 void __init setup_node_to_cpumask_map(void)
111 {
112 	unsigned int node;
113 
114 	/* setup nr_node_ids if not done yet */
115 	if (nr_node_ids == MAX_NUMNODES)
116 		setup_nr_node_ids();
117 
118 	/* allocate the map */
119 	for (node = 0; node < nr_node_ids; node++)
120 		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
121 
122 	/* cpumask_of_node() will now work */
123 	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
124 }
125 
numa_register_nodes(void)126 static int __init numa_register_nodes(void)
127 {
128 	int nid;
129 
130 	if (!memblock_validate_numa_coverage(SZ_1M))
131 		return -EINVAL;
132 
133 	/* Finally register nodes. */
134 	for_each_node_mask(nid, node_possible_map) {
135 		unsigned long start_pfn, end_pfn;
136 
137 		/*
138 		 * Note, get_pfn_range_for_nid() depends on
139 		 * memblock_set_node() having already happened
140 		 */
141 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
142 		if (start_pfn >= end_pfn)
143 			continue;
144 
145 		alloc_node_data(nid);
146 		node_set_online(nid);
147 	}
148 
149 	/* Dump memblock with node info and return. */
150 	memblock_dump_all();
151 	return 0;
152 }
153 
154 /*
155  * There are unfortunately some poorly designed mainboards around that
156  * only connect memory to a single CPU. This breaks the 1:1 cpu->node
157  * mapping. To avoid this fill in the mapping for all possible CPUs,
158  * as the number of CPUs is not known yet. We round robin the existing
159  * nodes.
160  */
numa_init_array(void)161 static void __init numa_init_array(void)
162 {
163 	int rr, i;
164 
165 	rr = first_node(node_online_map);
166 	for (i = 0; i < nr_cpu_ids; i++) {
167 		if (early_cpu_to_node(i) != NUMA_NO_NODE)
168 			continue;
169 		numa_set_node(i, rr);
170 		rr = next_node_in(rr, node_online_map);
171 	}
172 }
173 
numa_init(int (* init_func)(void))174 static int __init numa_init(int (*init_func)(void))
175 {
176 	int i;
177 	int ret;
178 
179 	for (i = 0; i < MAX_LOCAL_APIC; i++)
180 		set_apicid_to_node(i, NUMA_NO_NODE);
181 
182 	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
183 	if (ret < 0)
184 		return ret;
185 
186 	ret = numa_register_nodes();
187 	if (ret < 0)
188 		return ret;
189 
190 	for (i = 0; i < nr_cpu_ids; i++) {
191 		int nid = early_cpu_to_node(i);
192 
193 		if (nid == NUMA_NO_NODE)
194 			continue;
195 		if (!node_online(nid))
196 			numa_clear_node(i);
197 	}
198 	numa_init_array();
199 
200 	return 0;
201 }
202 
203 /**
204  * dummy_numa_init - Fallback dummy NUMA init
205  *
206  * Used if there's no underlying NUMA architecture, NUMA initialization
207  * fails, or NUMA is disabled on the command line.
208  *
209  * Must online at least one node and add memory blocks that cover all
210  * allowed memory.  This function must not fail.
211  */
dummy_numa_init(void)212 static int __init dummy_numa_init(void)
213 {
214 	printk(KERN_INFO "%s\n",
215 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
216 	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
217 	       0LLU, PFN_PHYS(max_pfn) - 1);
218 
219 	node_set(0, numa_nodes_parsed);
220 	node_set(0, numa_phys_nodes_parsed);
221 	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
222 
223 	return 0;
224 }
225 
226 /**
227  * x86_numa_init - Initialize NUMA
228  *
229  * Try each configured NUMA initialization method until one succeeds.  The
230  * last fallback is dummy single node config encompassing whole memory and
231  * never fails.
232  */
x86_numa_init(void)233 void __init x86_numa_init(void)
234 {
235 	if (!numa_off) {
236 #ifdef CONFIG_ACPI_NUMA
237 		if (!numa_init(x86_acpi_numa_init))
238 			return;
239 #endif
240 #ifdef CONFIG_AMD_NUMA
241 		if (!numa_init(amd_numa_init))
242 			return;
243 #endif
244 		if (acpi_disabled && !numa_init(of_numa_init))
245 			return;
246 	}
247 
248 	numa_init(dummy_numa_init);
249 }
250 
251 
252 /*
253  * A node may exist which has one or more Generic Initiators but no CPUs and no
254  * memory.
255  *
256  * This function must be called after init_cpu_to_node(), to ensure that any
257  * memoryless CPU nodes have already been brought online, and before the
258  * node_data[nid] is needed for zone list setup in build_all_zonelists().
259  *
260  * When this function is called, any nodes containing either memory and/or CPUs
261  * will already be online and there is no need to do anything extra, even if
262  * they also contain one or more Generic Initiators.
263  */
init_gi_nodes(void)264 void __init init_gi_nodes(void)
265 {
266 	int nid;
267 
268 	/*
269 	 * Exclude this node from
270 	 * bringup_nonboot_cpus
271 	 *  cpu_up
272 	 *   __try_online_node
273 	 *    register_node
274 	 * because node_subsys is not initialized yet.
275 	 * TODO remove dependency on node_online
276 	 */
277 	for_each_node_state(nid, N_GENERIC_INITIATOR)
278 		if (!node_online(nid))
279 			node_set_online(nid);
280 }
281 
282 /*
283  * Setup early cpu_to_node.
284  *
285  * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
286  * and apicid_to_node[] tables have valid entries for a CPU.
287  * This means we skip cpu_to_node[] initialisation for NUMA
288  * emulation and faking node case (when running a kernel compiled
289  * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
290  * is already initialized in a round robin manner at numa_init_array,
291  * prior to this call, and this initialization is good enough
292  * for the fake NUMA cases.
293  *
294  * Called before the per_cpu areas are setup.
295  */
init_cpu_to_node(void)296 void __init init_cpu_to_node(void)
297 {
298 	int cpu;
299 	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
300 
301 	BUG_ON(cpu_to_apicid == NULL);
302 
303 	for_each_possible_cpu(cpu) {
304 		int node = numa_cpu_node(cpu);
305 
306 		if (node == NUMA_NO_NODE)
307 			continue;
308 
309 		/*
310 		 * Exclude this node from
311 		 * bringup_nonboot_cpus
312 		 *  cpu_up
313 		 *   __try_online_node
314 		 *    register_node
315 		 * because node_subsys is not initialized yet.
316 		 * TODO remove dependency on node_online
317 		 */
318 		if (!node_online(node))
319 			node_set_online(node);
320 
321 		numa_set_node(cpu, node);
322 	}
323 }
324 
325 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
326 
327 # ifndef CONFIG_NUMA_EMU
numa_add_cpu(unsigned int cpu)328 void numa_add_cpu(unsigned int cpu)
329 {
330 	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
331 }
332 
numa_remove_cpu(unsigned int cpu)333 void numa_remove_cpu(unsigned int cpu)
334 {
335 	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
336 }
337 # endif	/* !CONFIG_NUMA_EMU */
338 
339 #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
340 
__cpu_to_node(int cpu)341 int __cpu_to_node(int cpu)
342 {
343 	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
344 		printk(KERN_WARNING
345 			"cpu_to_node(%d): usage too early!\n", cpu);
346 		dump_stack();
347 		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
348 	}
349 	return per_cpu(x86_cpu_to_node_map, cpu);
350 }
351 EXPORT_SYMBOL(__cpu_to_node);
352 
353 /*
354  * Same function as cpu_to_node() but used if called before the
355  * per_cpu areas are setup.
356  */
early_cpu_to_node(int cpu)357 int early_cpu_to_node(int cpu)
358 {
359 	if (early_per_cpu_ptr(x86_cpu_to_node_map))
360 		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
361 
362 	if (!cpu_possible(cpu)) {
363 		printk(KERN_WARNING
364 			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
365 		dump_stack();
366 		return NUMA_NO_NODE;
367 	}
368 	return per_cpu(x86_cpu_to_node_map, cpu);
369 }
370 
/*
 * Debug helper: set or clear @cpu in @node's cpumask, sanity-checking
 * the node and logging every change.
 */
void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
		enable ? "numa_add_cpu" : "numa_remove_cpu",
		cpu, node, cpumask_pr_args(mask));
	/* redundant trailing "return;" removed */
}
396 
397 # ifndef CONFIG_NUMA_EMU
/* Route add/remove through the debug helper using the early node map. */
static void numa_set_cpumask(int cpu, bool enable)
{
	int node = early_cpu_to_node(cpu);

	debug_cpumask_set_cpu(cpu, node, enable);
}
402 
numa_add_cpu(unsigned int cpu)403 void numa_add_cpu(unsigned int cpu)
404 {
405 	numa_set_cpumask(cpu, true);
406 }
407 
numa_remove_cpu(unsigned int cpu)408 void numa_remove_cpu(unsigned int cpu)
409 {
410 	numa_set_cpumask(cpu, false);
411 }
412 # endif	/* !CONFIG_NUMA_EMU */
413 
414 /*
415  * Returns a pointer to the bitmask of CPUs on Node 'node'.
416  */
cpumask_of_node(int node)417 const struct cpumask *cpumask_of_node(int node)
418 {
419 	if ((unsigned)node >= nr_node_ids) {
420 		printk(KERN_WARNING
421 			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
422 			node, nr_node_ids);
423 		dump_stack();
424 		return cpu_none_mask;
425 	}
426 	if (!cpumask_available(node_to_cpumask_map[node])) {
427 		printk(KERN_WARNING
428 			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
429 			node);
430 		dump_stack();
431 		return cpu_online_mask;
432 	}
433 	return node_to_cpumask_map[node];
434 }
435 EXPORT_SYMBOL(cpumask_of_node);
436 
437 #endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
438 
439 #ifdef CONFIG_NUMA_EMU
numa_emu_update_cpu_to_node(int * emu_nid_to_phys,unsigned int nr_emu_nids)440 void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
441 					unsigned int nr_emu_nids)
442 {
443 	int i, j;
444 
445 	/*
446 	 * Transform __apicid_to_node table to use emulated nids by
447 	 * reverse-mapping phys_nid.  The maps should always exist but fall
448 	 * back to zero just in case.
449 	 */
450 	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
451 		if (__apicid_to_node[i] == NUMA_NO_NODE)
452 			continue;
453 		for (j = 0; j < nr_emu_nids; j++)
454 			if (__apicid_to_node[i] == emu_nid_to_phys[j])
455 				break;
456 		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
457 	}
458 }
459 
numa_emu_dma_end(void)460 u64 __init numa_emu_dma_end(void)
461 {
462 	return PFN_PHYS(MAX_DMA32_PFN);
463 }
464 #endif /* CONFIG_NUMA_EMU */
465