// SPDX-License-Identifier: GPL-2.0-only
/*
 * NUMA support, based on the x86 implementation.
 *
 * Copyright (C) 2015 Cavium Inc.
 * Author: Ganapatrao Kulkarni <gkulkarni@cavium.com>
 */

#define pr_fmt(fmt) "NUMA: " fmt

#include <linux/acpi.h>
#include <linux/memblock.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/numa_memblks.h>

#include <asm/sections.h>

static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };

bool numa_off;

static __init int numa_parse_early_param(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (str_has_prefix(opt, "off"))
		numa_off = true;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);

	return 0;
}
early_param("numa", numa_parse_early_param);

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

#ifdef CONFIG_DEBUG_PER_CPU_MAPS

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if (node == NUMA_NO_NODE)
		return cpu_all_mask;

	if (WARN_ON(node < 0 || node >= nr_node_ids))
		return cpu_none_mask;

	if (WARN_ON(node_to_cpumask_map[node] == NULL))
		return cpu_online_mask;

	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif

#ifndef CONFIG_NUMA_EMU
static void numa_update_cpu(unsigned int cpu, bool remove)
{
	int nid = cpu_to_node(cpu);

	if (nid == NUMA_NO_NODE)
		return;

	if (remove)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
	else
		cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_update_cpu(cpu, false);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_update_cpu(cpu, true);
}
#endif

void numa_clear_node(unsigned int cpu)
{
	numa_remove_cpu(cpu);
	set_cpu_numa_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
static void __init setup_node_to_cpumask_map(void)
{
	int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate and clear the mapping */
	for (node = 0; node < nr_node_ids; node++) {
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
		cpumask_clear(node_to_cpumask_map[node]);
	}

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

/*
 * Set the cpu to node and mem mapping
 */
void numa_store_cpu_info(unsigned int cpu)
{
	set_cpu_numa_node(cpu, cpu_to_node_map[cpu]);
}

void __init early_map_cpu_to_node(unsigned int cpu, int nid)
{
	/* fallback to node 0 */
	if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
		nid = 0;

	cpu_to_node_map[cpu] = nid;

	/*
	 * We should set the numa node of cpu0 as soon as possible, because it
	 * has already been set up online before. cpu_to_node(0) will soon be
	 * called.
	 */
	if (!cpu)
		set_cpu_numa_node(cpu, nid);
}

#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

int early_cpu_to_node(int cpu)
{
	return cpu_to_node_map[cpu];
}

static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
	return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc = -EINVAL;

	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		/*
		 * Always reserve area for module percpu variables. That's
		 * what the legacy allocator did.
		 */
		rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
					    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
					    pcpu_cpu_distance,
					    early_cpu_to_node);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		if (rc < 0)
			pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
				pcpu_fc_names[pcpu_chosen_fc], rc);
#endif
	}

#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node);
#endif
	if (rc < 0)
		panic("Failed to initialize percpu areas (err=%d).", rc);

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif

/*
 * Initialize NODE_DATA for a node on the local memory
 */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	if (start_pfn >= end_pfn)
		pr_info("Initmem setup node %d [<memory-less node>]\n", nid);

	alloc_node_data(nid);

	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
}

static int __init numa_register_nodes(void)
{
	int nid;

	/* Finally register nodes. */
	for_each_node_mask(nid, numa_nodes_parsed) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}

	/* Restrict the possible nodes to the ones actually parsed */
	node_possible_map = numa_nodes_parsed;

	return 0;
}

static int __init numa_init(int (*init_func)(void))
{
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ false);
	if (ret < 0)
		goto out_free_distance;

	if (nodes_empty(numa_nodes_parsed)) {
		pr_info("No NUMA configuration found\n");
		ret = -EINVAL;
		goto out_free_distance;
	}

	ret = numa_register_nodes();
	if (ret < 0)
		goto out_free_distance;

	setup_node_to_cpumask_map();

	return 0;
out_free_distance:
	numa_reset_distance();
	return ret;
}

/**
 * dummy_numa_init() - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node (node 0) and add memory blocks that cover all
 * allowed memory. It is unlikely that this function fails.
 *
 * Return: 0 on success, -errno on failure.
 */
static int __init dummy_numa_init(void)
{
	phys_addr_t start = memblock_start_of_DRAM();
	phys_addr_t end = memblock_end_of_DRAM() - 1;
	int ret;

	if (numa_off)
		pr_info("NUMA disabled\n"); /* Forced off on command line. */
	pr_info("Faking a node at [mem %pap-%pap]\n", &start, &end);

	ret = numa_add_memblk(0, start, end + 1);
	if (ret) {
		pr_err("NUMA init failed\n");
		return ret;
	}
	node_set(0, numa_nodes_parsed);

	numa_off = true;
	return 0;
}

#ifdef CONFIG_ACPI_NUMA
static int __init arch_acpi_numa_init(void)
{
	int ret;

	ret = acpi_numa_init();
	if (ret) {
		pr_debug("Failed to initialise from firmware\n");
		return ret;
	}

	return srat_disabled() ? -EINVAL : 0;
}
#else
static int __init arch_acpi_numa_init(void)
{
	return -EOPNOTSUPP;
}
#endif

/**
 * arch_numa_init() - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is a dummy single-node configuration encompassing all memory.
 */
void __init arch_numa_init(void)
{
	if (!numa_off) {
		if (!acpi_disabled && !numa_init(arch_acpi_numa_init))
			return;
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform cpu_to_node_map table to use emulated nids by
	 * reverse-mapping phys_nid. The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(cpu_to_node_map); i++) {
		if (cpu_to_node_map[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (cpu_to_node_map[i] == emu_nid_to_phys[j])
				break;
		cpu_to_node_map[i] = j < nr_emu_nids ? j : 0;
	}
}

u64 __init numa_emu_dma_end(void)
{
	return memblock_start_of_DRAM() + SZ_4G;
}

void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE)
		return;

	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	pr_debug("%s cpu %d node %d: mask now %*pbl\n",
		 enable ? "numa_add_cpu" : "numa_remove_cpu",
		 cpu, node, cpumask_pr_args(mask));
}
#endif /* CONFIG_NUMA_EMU */
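
/*
 * Editor's note: the block below is an illustrative sketch, not part of the
 * upstream file.  It outlines the order in which an architecture is expected
 * to call the hooks defined above during boot.  The arch-side function names
 * (arch_enumerate_cpus(), arch_bootmem_init(), arch_cpu_starting()) are
 * hypothetical placeholders; the real call sites live in the architecture
 * code (e.g. arm64 or riscv) and may differ in detail.
 */
#if 0	/* illustration only, never compiled */
static void __init arch_enumerate_cpus(void)
{
	unsigned int cpu = 0;
	struct device_node *dn;

	/*
	 * Step 1: while enumerating CPUs from the devicetree (or ACPI),
	 * record each CPU's firmware node id.  Invalid ids, or "numa=off",
	 * fall back to node 0 inside early_map_cpu_to_node().
	 */
	for_each_of_cpu_node(dn)
		early_map_cpu_to_node(cpu++, of_node_to_nid(dn));
}

static void __init arch_bootmem_init(void)
{
	/*
	 * Step 2: once memblock describes all memory, build the node layout
	 * from ACPI SRAT or the devicetree; dummy_numa_init() is the final
	 * fallback.
	 */
	arch_numa_init();
}

static void arch_cpu_starting(unsigned int cpu)
{
	/*
	 * Step 3: as each CPU comes up, commit its early cpu->node mapping
	 * and add it to its node's cpumask.
	 */
	numa_store_cpu_info(cpu);
	numa_add_cpu(cpu);
}
#endif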