1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Common code for 32 and 64-bit NUMA */
3 #include <linux/acpi.h>
4 #include <linux/kernel.h>
5 #include <linux/mm.h>
6 #include <linux/of.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/memblock.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/nodemask.h>
13 #include <linux/sched.h>
14 #include <linux/topology.h>
15 #include <linux/sort.h>
16 #include <linux/numa_memblks.h>
17
18 #include <asm/e820/api.h>
19 #include <asm/proto.h>
20 #include <asm/dma.h>
21 #include <asm/numa.h>
22 #include <asm/amd/nb.h>
23
24 #include "mm_internal.h"
25
/* Set non-zero by "numa=off" on the command line; forces dummy_numa_init(). */
int numa_off;
27
numa_setup(char * opt)28 static __init int numa_setup(char *opt)
29 {
30 if (!opt)
31 return -EINVAL;
32 if (!strncmp(opt, "off", 3))
33 numa_off = 1;
34 if (!strncmp(opt, "fake=", 5))
35 return numa_emu_cmdline(opt + 5);
36 if (!strncmp(opt, "noacpi", 6))
37 disable_srat();
38 if (!strncmp(opt, "nohmat", 6))
39 disable_hmat();
40 return 0;
41 }
42 early_param("numa", numa_setup);
43
/*
 * apicid, cpu, node mappings
 */
/* Per-APIC-ID node map, filled in by the firmware parsers (SRAT etc.). */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

/* Nodes the platform parsers reported as physically present. */
nodemask_t numa_phys_nodes_parsed __initdata;
52
numa_cpu_node(int cpu)53 int numa_cpu_node(int cpu)
54 {
55 u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
56
57 if (apicid != BAD_APICID)
58 return __apicid_to_node[apicid];
59 return NUMA_NO_NODE;
60 }
61
num_phys_nodes(void)62 int __init num_phys_nodes(void)
63 {
64 return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
65 }
66
/* Per-node CPU masks; allocated in setup_node_to_cpumask_map(). */
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
75
/*
 * Record the cpu -> node mapping for @cpu.
 *
 * Before the per-cpu areas exist, the mapping is stored in the early
 * boot array; afterwards it goes into the real per-cpu variable and
 * the generic cpu_to_node() state via set_cpu_numa_node().
 */
void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	/* Catch callers passing a bogus CPU number once per-cpu is live. */
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	/* Keep the generic (non-arch) cpu_to_node() state in sync. */
	set_cpu_numa_node(cpu, node);
}
97
/* Drop the cpu -> node association by storing the NUMA_NO_NODE sentinel. */
void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}
102
103 /*
104 * Allocate node_to_cpumask_map based on number of available nodes
105 * Requires node_possible_map to be valid.
106 *
107 * Note: cpumask_of_node() is not valid until after this is done.
108 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
109 */
setup_node_to_cpumask_map(void)110 void __init setup_node_to_cpumask_map(void)
111 {
112 unsigned int node;
113
114 /* setup nr_node_ids if not done yet */
115 if (nr_node_ids == MAX_NUMNODES)
116 setup_nr_node_ids();
117
118 /* allocate the map */
119 for (node = 0; node < nr_node_ids; node++)
120 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
121
122 /* cpumask_of_node() will now work */
123 pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
124 }
125
/*
 * Bring every possible node with memory online and allocate its node_data.
 *
 * Returns 0 on success, -EINVAL if memblock's node coverage check fails
 * (i.e. more than the tolerated amount of memory lacks node info).
 */
static int __init numa_register_nodes(void)
{
	int nid;

	/* Reject configs where >1M of memory has no node assignment. */
	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note, get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		/* Skip memoryless nodes; they stay offline here. */
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}
153
154 /*
155 * There are unfortunately some poorly designed mainboards around that
156 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
157 * mapping. To avoid this fill in the mapping for all possible CPUs,
158 * as the number of CPUs is not known yet. We round robin the existing
159 * nodes.
160 */
numa_init_array(void)161 static void __init numa_init_array(void)
162 {
163 int rr, i;
164
165 rr = first_node(node_online_map);
166 for (i = 0; i < nr_cpu_ids; i++) {
167 if (early_cpu_to_node(i) != NUMA_NO_NODE)
168 continue;
169 numa_set_node(i, rr);
170 rr = next_node_in(rr, node_online_map);
171 }
172 }
173
/*
 * Run one NUMA detection method and finish common setup.
 *
 * @init_func: platform-specific parser (ACPI SRAT, AMD NB, OF, or dummy)
 * that fills numa_nodes_parsed and the memblk tables.
 *
 * Returns 0 on success, a negative error code when the parser or node
 * registration fails (the caller then tries the next method).
 */
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	/* Reset any mapping left over from a previously failed attempt. */
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	/*
	 * Drop cpu->node links that point at nodes which ended up
	 * offline (e.g. memoryless nodes skipped during registration).
	 */
	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	/* Round-robin any still-unmapped CPUs over the online nodes. */
	numa_init_array();

	return 0;
}
202
/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	/* Single node 0 covering [0, max_pfn); counts as "parsed" too. */
	node_set(0, numa_nodes_parsed);
	node_set(0, numa_phys_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}
225
/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
		/* Preference order: ACPI SRAT, AMD northbridge, devicetree. */
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		/* Devicetree is only consulted when ACPI is disabled. */
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	/* Guaranteed-success single-node fallback. */
	numa_init(dummy_numa_init);
}
250
251
/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}
281
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and faking node case (when running a kernel compiled
 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	/* The early APIC-ID table must exist at this point in boot. */
	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		/* Keep whatever numa_init_array() assigned earlier. */
		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}
324
325 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
326
327 # ifndef CONFIG_NUMA_EMU
numa_add_cpu(unsigned int cpu)328 void numa_add_cpu(unsigned int cpu)
329 {
330 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
331 }
332
numa_remove_cpu(unsigned int cpu)333 void numa_remove_cpu(unsigned int cpu)
334 {
335 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
336 }
337 # endif /* !CONFIG_NUMA_EMU */
338
339 #else /* !CONFIG_DEBUG_PER_CPU_MAPS */
340
__cpu_to_node(int cpu)341 int __cpu_to_node(int cpu)
342 {
343 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
344 printk(KERN_WARNING
345 "cpu_to_node(%d): usage too early!\n", cpu);
346 dump_stack();
347 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
348 }
349 return per_cpu(x86_cpu_to_node_map, cpu);
350 }
351 EXPORT_SYMBOL(__cpu_to_node);
352
353 /*
354 * Same function as cpu_to_node() but used if called before the
355 * per_cpu areas are setup.
356 */
early_cpu_to_node(int cpu)357 int early_cpu_to_node(int cpu)
358 {
359 if (early_per_cpu_ptr(x86_cpu_to_node_map))
360 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
361
362 if (!cpu_possible(cpu)) {
363 printk(KERN_WARNING
364 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
365 dump_stack();
366 return NUMA_NO_NODE;
367 }
368 return per_cpu(x86_cpu_to_node_map, cpu);
369 }
370
/*
 * Set or clear @cpu in @node's cpumask, logging the resulting mask.
 *
 * @enable: true adds the CPU to the mask, false removes it.
 * Silently ignores NUMA_NO_NODE; complains (with a stack trace) if the
 * node's cpumask was never allocated.
 */
void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
		enable ? "numa_add_cpu" : "numa_remove_cpu",
		cpu, node, cpumask_pr_args(mask));
	/* NOTE: dropped the redundant trailing "return;" of a void function. */
}
396
397 # ifndef CONFIG_NUMA_EMU
/* Helper: apply debug_cpumask_set_cpu() using the CPU's early node lookup. */
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}
402
/* Debug variant: add @cpu to its node's cpumask with logging. */
void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}
407
/* Debug variant: remove @cpu from its node's cpumask with logging. */
void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
412 # endif /* !CONFIG_NUMA_EMU */
413
/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	/* Out-of-range node: warn and return an empty mask. */
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
			node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	/* Map not allocated yet (before setup_node_to_cpumask_map()):
	 * warn and fall back to all online CPUs. */
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
			node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);
436
437 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
438
439 #ifdef CONFIG_NUMA_EMU
/*
 * Rewrite __apicid_to_node[] so each entry names an emulated node id
 * instead of a physical one.
 *
 * @emu_nid_to_phys: for each emulated nid, the physical nid it maps to
 * @nr_emu_nids:     number of valid entries in @emu_nid_to_phys
 */
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid.  The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		/* Find the first emulated nid backed by this physical nid. */
		for (j = 0; j < nr_emu_nids; j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		/* No match (shouldn't happen): fall back to emulated node 0. */
		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}
}
459
/* Physical address where the DMA32 zone ends, for the NUMA emulation code. */
u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
464 #endif /* CONFIG_NUMA_EMU */
465