// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <linux/numa_memblks.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/amd/nb.h>

#include "mm_internal.h"

int numa_off;

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

nodemask_t numa_phys_nodes_parsed __initdata;

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

int __init num_phys_nodes(void)
{
	return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES);
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on the number of available nodes.
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
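
/*
 * Illustrative sketch (not part of this file upstream, hence compiled
 * out): once setup_node_to_cpumask_map() has populated the per-node
 * cpumasks, cpumask_of_node() is safe to call. The helper name below is
 * hypothetical.
 */
#if 0
static void __init example_dump_node_cpumasks(void)
{
	int node;

	/* Walk every online node and print the CPUs attached to it. */
	for_each_online_node(node)
		pr_info("node %d: cpus %*pbl\n", node,
			cpumask_pr_args(cpumask_of_node(node)));
}
#endif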
static int __init numa_register_nodes(void)
{
	int nid;

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note: get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened.
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To work around this, fill in the mapping for all possible
 * CPUs, since the actual number of CPUs is not known yet; round-robin
 * over the existing online nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used when there is no underlying NUMA architecture, when NUMA
 * initialization fails, or when NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	node_set(0, numa_phys_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is a dummy single-node configuration encompassing the
 * whole of memory, which never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}
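
/*
 * Usage note (boot command line, parsed by numa_setup() above):
 *
 *	numa=off	disable NUMA; x86_numa_init() falls straight
 *			through to dummy_numa_init()
 *	numa=fake=<N>	emulate <N> nodes (CONFIG_NUMA_EMU); the value is
 *			handed to numa_emu_cmdline()
 *	numa=noacpi	ignore the ACPI SRAT via disable_srat()
 *	numa=nohmat	ignore the ACPI HMAT via disable_hmat()
 */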
/*
 * A node may exist which has one or more Generic Initiators but no CPUs
 * and no memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that
 * any memoryless CPU nodes have already been brought online, and before
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any node containing memory and/or CPUs
 * will already be online and there is no need to do anything extra, even
 * if it also contains one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *     cpu_up
	 *       __try_online_node
	 *         register_node
	 * because node_subsys is not initialized yet.
	 * TODO: remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Set up the early cpu_to_node[] mapping.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and apicid_to_node[]
 * have valid entries for a CPU. This means we skip cpu_to_node[]
 * initialization for NUMA emulation and the fake-node case (a kernel
 * compiled for NUMA running on a non-NUMA box), which is fine:
 * numa_init_array() has already initialized cpu_to_node[] in a
 * round-robin manner before this call, and that is good enough for the
 * fake NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *     cpu_up
		 *       __try_online_node
		 *         register_node
		 * because node_subsys is not initialized yet.
		 * TODO: remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(unsigned int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(unsigned int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif	/* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);
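
/*
 * Illustrative sketch (hypothetical caller, compiled out): code that
 * runs before the per-CPU areas are set up must use early_cpu_to_node();
 * cpu_to_node()/__cpu_to_node() only become valid afterwards, and the
 * warning above fires if that ordering is violated.
 */
#if 0
static int __init example_boot_cpu_node(void)
{
	/* The boot CPU is CPU 0 on x86; fall back to the first online node. */
	int nid = early_cpu_to_node(0);

	return nid == NUMA_NO_NODE ? first_online_node : nid;
}
#endif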
356 */ 357 int early_cpu_to_node(int cpu) 358 { 359 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 360 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 361 362 if (!cpu_possible(cpu)) { 363 printk(KERN_WARNING 364 "early_cpu_to_node(%d): no per_cpu area!\n", cpu); 365 dump_stack(); 366 return NUMA_NO_NODE; 367 } 368 return per_cpu(x86_cpu_to_node_map, cpu); 369 } 370 371 void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable) 372 { 373 struct cpumask *mask; 374 375 if (node == NUMA_NO_NODE) { 376 /* early_cpu_to_node() already emits a warning and trace */ 377 return; 378 } 379 mask = node_to_cpumask_map[node]; 380 if (!cpumask_available(mask)) { 381 pr_err("node_to_cpumask_map[%i] NULL\n", node); 382 dump_stack(); 383 return; 384 } 385 386 if (enable) 387 cpumask_set_cpu(cpu, mask); 388 else 389 cpumask_clear_cpu(cpu, mask); 390 391 printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", 392 enable ? "numa_add_cpu" : "numa_remove_cpu", 393 cpu, node, cpumask_pr_args(mask)); 394 return; 395 } 396 397 # ifndef CONFIG_NUMA_EMU 398 static void numa_set_cpumask(int cpu, bool enable) 399 { 400 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); 401 } 402 403 void numa_add_cpu(unsigned int cpu) 404 { 405 numa_set_cpumask(cpu, true); 406 } 407 408 void numa_remove_cpu(unsigned int cpu) 409 { 410 numa_set_cpumask(cpu, false); 411 } 412 # endif /* !CONFIG_NUMA_EMU */ 413 414 /* 415 * Returns a pointer to the bitmask of CPUs on Node 'node'. 416 */ 417 const struct cpumask *cpumask_of_node(int node) 418 { 419 if ((unsigned)node >= nr_node_ids) { 420 printk(KERN_WARNING 421 "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n", 422 node, nr_node_ids); 423 dump_stack(); 424 return cpu_none_mask; 425 } 426 if (!cpumask_available(node_to_cpumask_map[node])) { 427 printk(KERN_WARNING 428 "cpumask_of_node(%d): no node_to_cpumask_map!\n", 429 node); 430 dump_stack(); 431 return cpu_online_mask; 432 } 433 return node_to_cpumask_map[node]; 434 } 435 EXPORT_SYMBOL(cpumask_of_node); 436 437 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 438 439 #ifdef CONFIG_NUMA_EMU 440 void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, 441 unsigned int nr_emu_nids) 442 { 443 int i, j; 444 445 /* 446 * Transform __apicid_to_node table to use emulated nids by 447 * reverse-mapping phys_nid. The maps should always exist but fall 448 * back to zero just in case. 449 */ 450 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { 451 if (__apicid_to_node[i] == NUMA_NO_NODE) 452 continue; 453 for (j = 0; j < nr_emu_nids; j++) 454 if (__apicid_to_node[i] == emu_nid_to_phys[j]) 455 break; 456 __apicid_to_node[i] = j < nr_emu_nids ? j : 0; 457 } 458 } 459 460 u64 __init numa_emu_dma_end(void) 461 { 462 return PFN_PHYS(MAX_DMA32_PFN); 463 } 464 #endif /* CONFIG_NUMA_EMU */ 465