// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;
nodemask_t numa_nodes_parsed __initdata;

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

static int numa_distance_cnt;
static u8 *numa_distance;

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;
	int tnid;

	/*
	 * Allocate node data. Try node-local memory and then any node.
	 * Never allocate in DMA zone.
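	 *
	 * (memblock_phys_alloc_try_nid() provides that fallback: it tries
	 * the requested node first and retries on any node if that fails.)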
	 */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	/* report and initialize */
	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
	       nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = PFN_PHYS(max_pfn);
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
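			 *
			 * The k-loop below rejects the merge if any memblk
			 * belonging to a different node lies inside the
			 * combined [start, end) range.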
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
			       bi->nid, bi->start, bi->end - 1, bj->start,
			       bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
					 PFN_PHYS(max_pfn_mapped));
	if (!phys) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one which is large enough to accommodate all the
 * currently known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all memory known to the x86 architecture,
	 * and use those ranges to set the nid in memblock.reserved.
	 * This will split up the memblock regions along node
	 * boundaries and will set the node IDs as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (nid != MAX_NUMNODES)
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
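	 * (MEMBLOCK_HOTPLUG is set from the ACPI SRAT hot-pluggable memory
	 * affinity flag; clearing it leaves these ranges as ordinary,
	 * non-removable memory.)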
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * At very early time, the kernel has to use some memory, such as
	 * for loading the kernel image. We cannot prevent this anyway. So
	 * any node the kernel resides in should be un-hotpluggable.
	 *
	 * And by the time we get here, allocating the node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
				PFN_PHYS(pfn_align) >> 20,
				PFN_PHYS(PAGES_PER_SECTION) >> 20);
			return -EINVAL;
		}
	}

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = PFN_PHYS(max_pfn);
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start >= end)
			continue;

		alloc_node_data(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case that parsing SRAT failed. */
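	/*
	 * (A previously failed SRAT parse may have left MEMBLOCK_HOTPLUG set
	 * on some ranges; clear any stale flags before this attempt.)
	 */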
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction
	 * here because if we configured ACPI_NUMA, we have
	 * parsed SRAT in init_func(). It is OK to have the
	 * reset here even if we didn't configure ACPI_NUMA
	 * or the ACPI NUMA init fails and falls back to the
	 * dummy NUMA init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is a dummy single-node config encompassing all memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}


/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[]
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the faked node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array(),
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif	/* !CONFIG_NUMA_EMU */

#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
	return;
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif	/* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
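 *
 * Falls back to cpu_none_mask for an out-of-range node and to
 * cpu_online_mask if the node_to_cpumask_map[] entry is not yet allocated.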
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(phys_addr_t start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	/*
	 * A plain u64 subtraction truncated to int can yield the wrong
	 * sign for large start addresses; use a three-way compare instead.
	 */
	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the @start-@end
 * physical address range, such that the first memblk includes
 * @start, the last memblk includes @end, and any gaps in between
 * are filled.
 *
 * RETURNS:
 * 0		  : Success
 * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. Exclude (start == bi->end) since
	 * end addresses in both a CFMWS range and a memblk range
	 * are exclusive.
	 *
	 * This list of pointers is used to make in-place changes
	 * that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (start < bi->end && end >= bi->start) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's
	 * end address and backfilling to it if needed.
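	 *
	 * For example, blocks [a, b) and [c, d) with a gap (b < c) become
	 * [a, b) and [b, d): the later block's start is pulled back to the
	 * previous block's end.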
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#endif