// SPDX-License-Identifier: GPL-2.0-only
/*
 * NUMA support, based on the x86 implementation.
 *
 * Copyright (C) 2015 Cavium Inc.
 * Author: Ganapatrao Kulkarni <gkulkarni@cavium.com>
 */

#define pr_fmt(fmt) "NUMA: " fmt

#include <linux/acpi.h>
#include <linux/memblock.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/numa_memblks.h>

#include <asm/sections.h>

static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };

bool numa_off;

static __init int numa_parse_early_param(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (str_has_prefix(opt, "off"))
		numa_off = true;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);

	return 0;
}
early_param("numa", numa_parse_early_param);

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

#ifdef CONFIG_DEBUG_PER_CPU_MAPS

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if (node == NUMA_NO_NODE)
		return cpu_all_mask;

	if (WARN_ON(node < 0 || node >= nr_node_ids))
		return cpu_none_mask;

	if (WARN_ON(node_to_cpumask_map[node] == NULL))
		return cpu_online_mask;

	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif

#ifndef CONFIG_NUMA_EMU
/* Add or remove @cpu from the cpumask of its node. */
static void numa_update_cpu(unsigned int cpu, bool remove)
{
	int nid = cpu_to_node(cpu);

	if (nid == NUMA_NO_NODE)
		return;

	if (remove)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
	else
		cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_update_cpu(cpu, false);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_update_cpu(cpu, true);
}
#endif

void numa_clear_node(unsigned int cpu)
{
	numa_remove_cpu(cpu);
	set_cpu_numa_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes.
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
static void __init setup_node_to_cpumask_map(void)
{
	int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate and clear the mapping */
	for (node = 0; node < nr_node_ids; node++) {
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
		cpumask_clear(node_to_cpumask_map[node]);
	}

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

/*
 * Set the cpu to node and mem mapping
 */
void numa_store_cpu_info(unsigned int cpu)
{
	set_cpu_numa_node(cpu, cpu_to_node_map[cpu]);
}

void __init early_map_cpu_to_node(unsigned int cpu, int nid)
{
	/* fallback to node 0 */
	if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
		nid = 0;

	cpu_to_node_map[cpu] = nid;

	/*
	 * Set the NUMA node of CPU 0 as early as possible: it has already
	 * been brought online, and cpu_to_node(0) will be called soon.
	 */
	if (!cpu)
		set_cpu_numa_node(cpu, nid);
}
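
/*
 * Illustrative sketch, not part of the original file: the cpu_to_node_map
 * above is expected to be filled in by the architecture's early CPU
 * enumeration code (on arm64, the DT and ACPI paths), with
 * numa_store_cpu_info() called later when each CPU is brought up. The
 * function name below is made up for illustration; early_map_cpu_to_node(),
 * for_each_of_cpu_node() and of_node_to_nid() are real interfaces.
 */
#if 0
static void __init example_map_cpus_to_nodes(void)
{
	struct device_node *dn;
	unsigned int cpu = 0;

	/* of_node_to_nid() reads the "numa-node-id" property of the CPU node. */
	for_each_of_cpu_node(dn)
		early_map_cpu_to_node(cpu++, of_node_to_nid(dn));
}
#endif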

#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

int early_cpu_to_node(int cpu)
{
	return cpu_to_node_map[cpu];
}

/* Distance between the nodes owning two CPUs, used to group percpu units. */
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
	return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc = -EINVAL;

	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		/*
		 * Always reserve area for module percpu variables. That's
		 * what the legacy allocator did.
		 */
		rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
					    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
					    pcpu_cpu_distance,
					    early_cpu_to_node);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
		if (rc < 0)
			pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
				pcpu_fc_names[pcpu_chosen_fc], rc);
#endif
	}

#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node);
#endif
	if (rc < 0)
		panic("Failed to initialize percpu areas (err=%d).", rc);

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif

/*
 * Initialize NODE_DATA for a node on the local memory
 */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
	if (start_pfn >= end_pfn)
		pr_info("Initmem setup node %d [<memory-less node>]\n", nid);

	alloc_node_data(nid);

	NODE_DATA(nid)->node_id = nid;
	NODE_DATA(nid)->node_start_pfn = start_pfn;
	NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
}

static int __init numa_register_nodes(void)
{
	int nid;

	/* Check the validity of the memblock/node mapping */
	if (!memblock_validate_numa_coverage(0))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, numa_nodes_parsed) {
		unsigned long start_pfn, end_pfn;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		setup_node_data(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}

	/* The parsed nodes are now the set of possible nodes. */
	node_possible_map = numa_nodes_parsed;

	return 0;
}

static int __init numa_init(int (*init_func)(void))
{
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ false);
	if (ret < 0)
		goto out_free_distance;

	if (nodes_empty(numa_nodes_parsed)) {
		pr_info("No NUMA configuration found\n");
		ret = -EINVAL;
		goto out_free_distance;
	}

	ret = numa_register_nodes();
	if (ret < 0)
		goto out_free_distance;

	setup_node_to_cpumask_map();

	return 0;
out_free_distance:
	numa_reset_distance();
	return ret;
}
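
/*
 * Illustrative sketch, not part of the original file: the rough shape of an
 * init_func passed to numa_init(). A real implementation (of_numa_init() or
 * arch_acpi_numa_init() below) derives the ranges and distances from
 * firmware tables; the node IDs, addresses and distances here are made up.
 */
#if 0
static int __init example_firmware_numa_init(void)
{
	int ret;

	/* Assign physical memory ranges to nodes. */
	ret = numa_add_memblk(0, 0x80000000ULL, 0x100000000ULL);
	if (ret)
		return ret;
	ret = numa_add_memblk(1, 0x100000000ULL, 0x180000000ULL);
	if (ret)
		return ret;

	/* Optionally record the inter-node distance matrix. */
	numa_set_distance(0, 1, 20);
	numa_set_distance(1, 0, 20);

	return 0;
}
#endif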

/**
 * dummy_numa_init() - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node (node 0) and add memory blocks that cover all
 * allowed memory. It is unlikely that this function fails.
 *
 * Return: 0 on success, -errno on failure.
 */
static int __init dummy_numa_init(void)
{
	phys_addr_t start = memblock_start_of_DRAM();
	phys_addr_t end = memblock_end_of_DRAM() - 1;
	int ret;

	if (numa_off)
		pr_info("NUMA disabled\n"); /* Forced off on command line. */
	pr_info("Faking a node at [mem %pap-%pap]\n", &start, &end);

	ret = numa_add_memblk(0, start, end + 1);
	if (ret) {
		pr_err("NUMA init failed\n");
		return ret;
	}
	node_set(0, numa_nodes_parsed);

	numa_off = true;
	return 0;
}

#ifdef CONFIG_ACPI_NUMA
static int __init arch_acpi_numa_init(void)
{
	int ret;

	ret = acpi_numa_init();
	if (ret) {
		pr_debug("Failed to initialise from firmware\n");
		return ret;
	}

	return srat_disabled() ? -EINVAL : 0;
}
#else
static int __init arch_acpi_numa_init(void)
{
	return -EOPNOTSUPP;
}
#endif

/**
 * arch_numa_init() - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is a dummy single-node configuration encompassing all memory.
 */
void __init arch_numa_init(void)
{
	if (!numa_off) {
		if (!acpi_disabled && !numa_init(arch_acpi_numa_init))
			return;
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform cpu_to_node_map table to use emulated nids by
	 * reverse-mapping phys_nid. The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(cpu_to_node_map); i++) {
		if (cpu_to_node_map[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (cpu_to_node_map[i] == emu_nid_to_phys[j])
				break;
		cpu_to_node_map[i] = j < nr_emu_nids ? j : 0;
	}
}

u64 __init numa_emu_dma_end(void)
{
	return memblock_start_of_DRAM() + SZ_4G;
}

void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE)
		return;

	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	pr_debug("%s cpu %d node %d: mask now %*pbl\n",
		 enable ? "numa_add_cpu" : "numa_remove_cpu",
		 cpu, node, cpumask_pr_args(mask));
}
#endif /* CONFIG_NUMA_EMU */
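
/*
 * Illustrative sketch, not part of the original file: arch_numa_init() is
 * meant to be called once from the architecture's memory init path, after
 * memblock has been populated and before the zone/page allocator is set up
 * (arm64 does this from bootmem_init()). The function name below is made up
 * for illustration.
 */
#if 0
static void __init example_arch_mem_init(void)
{
	/* memblock already describes all usable RAM at this point. */
	arch_numa_init();

	/* NUMA-aware node data and the node cpumask maps are usable from here on. */
}
#endif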