1 /* Common code for 32 and 64-bit NUMA */ 2 #include <linux/acpi.h> 3 #include <linux/kernel.h> 4 #include <linux/mm.h> 5 #include <linux/string.h> 6 #include <linux/init.h> 7 #include <linux/bootmem.h> 8 #include <linux/memblock.h> 9 #include <linux/mmzone.h> 10 #include <linux/ctype.h> 11 #include <linux/nodemask.h> 12 #include <linux/sched.h> 13 #include <linux/topology.h> 14 15 #include <asm/e820/api.h> 16 #include <asm/proto.h> 17 #include <asm/dma.h> 18 #include <asm/amd_nb.h> 19 20 #include "numa_internal.h" 21 22 int numa_off; 23 nodemask_t numa_nodes_parsed __initdata; 24 25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26 EXPORT_SYMBOL(node_data); 27 28 static struct numa_meminfo numa_meminfo 29 #ifndef CONFIG_MEMORY_HOTPLUG 30 __initdata 31 #endif 32 ; 33 34 static int numa_distance_cnt; 35 static u8 *numa_distance; 36 37 static __init int numa_setup(char *opt) 38 { 39 if (!opt) 40 return -EINVAL; 41 if (!strncmp(opt, "off", 3)) 42 numa_off = 1; 43 #ifdef CONFIG_NUMA_EMU 44 if (!strncmp(opt, "fake=", 5)) 45 numa_emu_cmdline(opt + 5); 46 #endif 47 #ifdef CONFIG_ACPI_NUMA 48 if (!strncmp(opt, "noacpi", 6)) 49 acpi_numa = -1; 50 #endif 51 return 0; 52 } 53 early_param("numa", numa_setup); 54 55 /* 56 * apicid, cpu, node mappings 57 */ 58 s16 __apicid_to_node[MAX_LOCAL_APIC] = { 59 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 60 }; 61 62 int numa_cpu_node(int cpu) 63 { 64 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); 65 66 if (apicid != BAD_APICID) 67 return __apicid_to_node[apicid]; 68 return NUMA_NO_NODE; 69 } 70 71 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 72 EXPORT_SYMBOL(node_to_cpumask_map); 73 74 /* 75 * Map cpu index to node index 76 */ 77 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 78 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); 79 80 void numa_set_node(int cpu, int node) 81 { 82 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 83 84 /* early setting, no percpu area yet */ 85 if (cpu_to_node_map) { 86 cpu_to_node_map[cpu] = node; 87 return; 88 } 89 90 #ifdef CONFIG_DEBUG_PER_CPU_MAPS 91 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { 92 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); 93 dump_stack(); 94 return; 95 } 96 #endif 97 per_cpu(x86_cpu_to_node_map, cpu) = node; 98 99 set_cpu_numa_node(cpu, node); 100 } 101 102 void numa_clear_node(int cpu) 103 { 104 numa_set_node(cpu, NUMA_NO_NODE); 105 } 106 107 /* 108 * Allocate node_to_cpumask_map based on number of available nodes 109 * Requires node_possible_map to be valid. 110 * 111 * Note: cpumask_of_node() is not valid until after this is done. 112 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) 113 */ 114 void __init setup_node_to_cpumask_map(void) 115 { 116 unsigned int node; 117 118 /* setup nr_node_ids if not done yet */ 119 if (nr_node_ids == MAX_NUMNODES) 120 setup_nr_node_ids(); 121 122 /* allocate the map */ 123 for (node = 0; node < nr_node_ids; node++) 124 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 125 126 /* cpumask_of_node() will now work */ 127 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 128 } 129 130 static int __init numa_add_memblk_to(int nid, u64 start, u64 end, 131 struct numa_meminfo *mi) 132 { 133 /* ignore zero length blks */ 134 if (start == end) 135 return 0; 136 137 /* whine about and ignore invalid blks */ 138 if (start > end || nid < 0 || nid >= MAX_NUMNODES) { 139 pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", 140 nid, start, end - 1); 141 return 0; 142 } 143 144 if (mi->nr_blks >= NR_NODE_MEMBLKS) { 145 pr_err("too many memblk ranges\n"); 146 return -EINVAL; 147 } 148 149 mi->blk[mi->nr_blks].start = start; 150 mi->blk[mi->nr_blks].end = end; 151 mi->blk[mi->nr_blks].nid = nid; 152 mi->nr_blks++; 153 return 0; 154 } 155 156 /** 157 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo 158 * @idx: Index of memblk to remove 159 * @mi: numa_meminfo to remove memblk from 160 * 161 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and 162 * decrementing @mi->nr_blks. 163 */ 164 void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) 165 { 166 mi->nr_blks--; 167 memmove(&mi->blk[idx], &mi->blk[idx + 1], 168 (mi->nr_blks - idx) * sizeof(mi->blk[0])); 169 } 170 171 /** 172 * numa_add_memblk - Add one numa_memblk to numa_meminfo 173 * @nid: NUMA node ID of the new memblk 174 * @start: Start address of the new memblk 175 * @end: End address of the new memblk 176 * 177 * Add a new memblk to the default numa_meminfo. 178 * 179 * RETURNS: 180 * 0 on success, -errno on failure. 181 */ 182 int __init numa_add_memblk(int nid, u64 start, u64 end) 183 { 184 return numa_add_memblk_to(nid, start, end, &numa_meminfo); 185 } 186 187 /* Allocate NODE_DATA for a node on the local memory */ 188 static void __init alloc_node_data(int nid) 189 { 190 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 191 u64 nd_pa; 192 void *nd; 193 int tnid; 194 195 /* 196 * Allocate node data. Try node-local memory and then any node. 197 * Never allocate in DMA zone. 198 */ 199 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); 200 if (!nd_pa) { 201 nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, 202 MEMBLOCK_ALLOC_ACCESSIBLE); 203 if (!nd_pa) { 204 pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", 205 nd_size, nid); 206 return; 207 } 208 } 209 nd = __va(nd_pa); 210 211 /* report and initialize */ 212 printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, 213 nd_pa, nd_pa + nd_size - 1); 214 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); 215 if (tnid != nid) 216 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); 217 218 node_data[nid] = nd; 219 memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); 220 221 node_set_online(nid); 222 } 223 224 /** 225 * numa_cleanup_meminfo - Cleanup a numa_meminfo 226 * @mi: numa_meminfo to clean up 227 * 228 * Sanitize @mi by merging and removing unnecessary memblks. Also check for 229 * conflicts and clear unused memblks. 230 * 231 * RETURNS: 232 * 0 on success, -errno on failure. 233 */ 234 int __init numa_cleanup_meminfo(struct numa_meminfo *mi) 235 { 236 const u64 low = 0; 237 const u64 high = PFN_PHYS(max_pfn); 238 int i, j, k; 239 240 /* first, trim all entries */ 241 for (i = 0; i < mi->nr_blks; i++) { 242 struct numa_memblk *bi = &mi->blk[i]; 243 244 /* make sure all blocks are inside the limits */ 245 bi->start = max(bi->start, low); 246 bi->end = min(bi->end, high); 247 248 /* and there's no empty or non-exist block */ 249 if (bi->start >= bi->end || 250 !memblock_overlaps_region(&memblock.memory, 251 bi->start, bi->end - bi->start)) 252 numa_remove_memblk_from(i--, mi); 253 } 254 255 /* merge neighboring / overlapping entries */ 256 for (i = 0; i < mi->nr_blks; i++) { 257 struct numa_memblk *bi = &mi->blk[i]; 258 259 for (j = i + 1; j < mi->nr_blks; j++) { 260 struct numa_memblk *bj = &mi->blk[j]; 261 u64 start, end; 262 263 /* 264 * See whether there are overlapping blocks. Whine 265 * about but allow overlaps of the same nid. They 266 * will be merged below. 267 */ 268 if (bi->end > bj->start && bi->start < bj->end) { 269 if (bi->nid != bj->nid) { 270 pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", 271 bi->nid, bi->start, bi->end - 1, 272 bj->nid, bj->start, bj->end - 1); 273 return -EINVAL; 274 } 275 pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", 276 bi->nid, bi->start, bi->end - 1, 277 bj->start, bj->end - 1); 278 } 279 280 /* 281 * Join together blocks on the same node, holes 282 * between which don't overlap with memory on other 283 * nodes. 284 */ 285 if (bi->nid != bj->nid) 286 continue; 287 start = min(bi->start, bj->start); 288 end = max(bi->end, bj->end); 289 for (k = 0; k < mi->nr_blks; k++) { 290 struct numa_memblk *bk = &mi->blk[k]; 291 292 if (bi->nid == bk->nid) 293 continue; 294 if (start < bk->end && end > bk->start) 295 break; 296 } 297 if (k < mi->nr_blks) 298 continue; 299 printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", 300 bi->nid, bi->start, bi->end - 1, bj->start, 301 bj->end - 1, start, end - 1); 302 bi->start = start; 303 bi->end = end; 304 numa_remove_memblk_from(j--, mi); 305 } 306 } 307 308 /* clear unused ones */ 309 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { 310 mi->blk[i].start = mi->blk[i].end = 0; 311 mi->blk[i].nid = NUMA_NO_NODE; 312 } 313 314 return 0; 315 } 316 317 /* 318 * Set nodes, which have memory in @mi, in *@nodemask. 319 */ 320 static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, 321 const struct numa_meminfo *mi) 322 { 323 int i; 324 325 for (i = 0; i < ARRAY_SIZE(mi->blk); i++) 326 if (mi->blk[i].start != mi->blk[i].end && 327 mi->blk[i].nid != NUMA_NO_NODE) 328 node_set(mi->blk[i].nid, *nodemask); 329 } 330 331 /** 332 * numa_reset_distance - Reset NUMA distance table 333 * 334 * The current table is freed. The next numa_set_distance() call will 335 * create a new one. 336 */ 337 void __init numa_reset_distance(void) 338 { 339 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); 340 341 /* numa_distance could be 1LU marking allocation failure, test cnt */ 342 if (numa_distance_cnt) 343 memblock_free(__pa(numa_distance), size); 344 numa_distance_cnt = 0; 345 numa_distance = NULL; /* enable table creation */ 346 } 347 348 static int __init numa_alloc_distance(void) 349 { 350 nodemask_t nodes_parsed; 351 size_t size; 352 int i, j, cnt = 0; 353 u64 phys; 354 355 /* size the new table and allocate it */ 356 nodes_parsed = numa_nodes_parsed; 357 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); 358 359 for_each_node_mask(i, nodes_parsed) 360 cnt = i; 361 cnt++; 362 size = cnt * cnt * sizeof(numa_distance[0]); 363 364 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 365 size, PAGE_SIZE); 366 if (!phys) { 367 pr_warn("Warning: can't allocate distance table!\n"); 368 /* don't retry until explicitly reset */ 369 numa_distance = (void *)1LU; 370 return -ENOMEM; 371 } 372 memblock_reserve(phys, size); 373 374 numa_distance = __va(phys); 375 numa_distance_cnt = cnt; 376 377 /* fill with the default distances */ 378 for (i = 0; i < cnt; i++) 379 for (j = 0; j < cnt; j++) 380 numa_distance[i * cnt + j] = i == j ? 381 LOCAL_DISTANCE : REMOTE_DISTANCE; 382 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); 383 384 return 0; 385 } 386 387 /** 388 * numa_set_distance - Set NUMA distance from one NUMA to another 389 * @from: the 'from' node to set distance 390 * @to: the 'to' node to set distance 391 * @distance: NUMA distance 392 * 393 * Set the distance from node @from to @to to @distance. If distance table 394 * doesn't exist, one which is large enough to accommodate all the currently 395 * known nodes will be created. 396 * 397 * If such table cannot be allocated, a warning is printed and further 398 * calls are ignored until the distance table is reset with 399 * numa_reset_distance(). 400 * 401 * If @from or @to is higher than the highest known node or lower than zero 402 * at the time of table creation or @distance doesn't make sense, the call 403 * is ignored. 404 * This is to allow simplification of specific NUMA config implementations. 405 */ 406 void __init numa_set_distance(int from, int to, int distance) 407 { 408 if (!numa_distance && numa_alloc_distance() < 0) 409 return; 410 411 if (from >= numa_distance_cnt || to >= numa_distance_cnt || 412 from < 0 || to < 0) { 413 pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", 414 from, to, distance); 415 return; 416 } 417 418 if ((u8)distance != distance || 419 (from == to && distance != LOCAL_DISTANCE)) { 420 pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", 421 from, to, distance); 422 return; 423 } 424 425 numa_distance[from * numa_distance_cnt + to] = distance; 426 } 427 428 int __node_distance(int from, int to) 429 { 430 if (from >= numa_distance_cnt || to >= numa_distance_cnt) 431 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; 432 return numa_distance[from * numa_distance_cnt + to]; 433 } 434 EXPORT_SYMBOL(__node_distance); 435 436 /* 437 * Sanity check to catch more bad NUMA configurations (they are amazingly 438 * common). Make sure the nodes cover all memory. 439 */ 440 static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) 441 { 442 u64 numaram, e820ram; 443 int i; 444 445 numaram = 0; 446 for (i = 0; i < mi->nr_blks; i++) { 447 u64 s = mi->blk[i].start >> PAGE_SHIFT; 448 u64 e = mi->blk[i].end >> PAGE_SHIFT; 449 numaram += e - s; 450 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); 451 if ((s64)numaram < 0) 452 numaram = 0; 453 } 454 455 e820ram = max_pfn - absent_pages_in_range(0, max_pfn); 456 457 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ 458 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { 459 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", 460 (numaram << PAGE_SHIFT) >> 20, 461 (e820ram << PAGE_SHIFT) >> 20); 462 return false; 463 } 464 return true; 465 } 466 467 /* 468 * Mark all currently memblock-reserved physical memory (which covers the 469 * kernel's own memory ranges) as hot-unswappable. 470 */ 471 static void __init numa_clear_kernel_node_hotplug(void) 472 { 473 nodemask_t reserved_nodemask = NODE_MASK_NONE; 474 struct memblock_region *mb_region; 475 int i; 476 477 /* 478 * We have to do some preprocessing of memblock regions, to 479 * make them suitable for reservation. 480 * 481 * At this time, all memory regions reserved by memblock are 482 * used by the kernel, but those regions are not split up 483 * along node boundaries yet, and don't necessarily have their 484 * node ID set yet either. 485 * 486 * So iterate over all memory known to the x86 architecture, 487 * and use those ranges to set the nid in memblock.reserved. 488 * This will split up the memblock regions along node 489 * boundaries and will set the node IDs as well. 490 */ 491 for (i = 0; i < numa_meminfo.nr_blks; i++) { 492 struct numa_memblk *mb = numa_meminfo.blk + i; 493 int ret; 494 495 ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); 496 WARN_ON_ONCE(ret); 497 } 498 499 /* 500 * Now go over all reserved memblock regions, to construct a 501 * node mask of all kernel reserved memory areas. 502 * 503 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, 504 * numa_meminfo might not include all memblock.reserved 505 * memory ranges, because quirks such as trim_snb_memory() 506 * reserve specific pages for Sandy Bridge graphics. ] 507 */ 508 for_each_memblock(reserved, mb_region) { 509 if (mb_region->nid != MAX_NUMNODES) 510 node_set(mb_region->nid, reserved_nodemask); 511 } 512 513 /* 514 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory 515 * belonging to the reserved node mask. 516 * 517 * Note that this will include memory regions that reside 518 * on nodes that contain kernel memory - entire nodes 519 * become hot-unpluggable: 520 */ 521 for (i = 0; i < numa_meminfo.nr_blks; i++) { 522 struct numa_memblk *mb = numa_meminfo.blk + i; 523 524 if (!node_isset(mb->nid, reserved_nodemask)) 525 continue; 526 527 memblock_clear_hotplug(mb->start, mb->end - mb->start); 528 } 529 } 530 531 static int __init numa_register_memblks(struct numa_meminfo *mi) 532 { 533 unsigned long uninitialized_var(pfn_align); 534 int i, nid; 535 536 /* Account for nodes with cpus and no memory */ 537 node_possible_map = numa_nodes_parsed; 538 numa_nodemask_from_meminfo(&node_possible_map, mi); 539 if (WARN_ON(nodes_empty(node_possible_map))) 540 return -EINVAL; 541 542 for (i = 0; i < mi->nr_blks; i++) { 543 struct numa_memblk *mb = &mi->blk[i]; 544 memblock_set_node(mb->start, mb->end - mb->start, 545 &memblock.memory, mb->nid); 546 } 547 548 /* 549 * At very early time, the kernel have to use some memory such as 550 * loading the kernel image. We cannot prevent this anyway. So any 551 * node the kernel resides in should be un-hotpluggable. 552 * 553 * And when we come here, alloc node data won't fail. 554 */ 555 numa_clear_kernel_node_hotplug(); 556 557 /* 558 * If sections array is gonna be used for pfn -> nid mapping, check 559 * whether its granularity is fine enough. 560 */ 561 #ifdef NODE_NOT_IN_PAGE_FLAGS 562 pfn_align = node_map_pfn_alignment(); 563 if (pfn_align && pfn_align < PAGES_PER_SECTION) { 564 printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", 565 PFN_PHYS(pfn_align) >> 20, 566 PFN_PHYS(PAGES_PER_SECTION) >> 20); 567 return -EINVAL; 568 } 569 #endif 570 if (!numa_meminfo_cover_memory(mi)) 571 return -EINVAL; 572 573 /* Finally register nodes. */ 574 for_each_node_mask(nid, node_possible_map) { 575 u64 start = PFN_PHYS(max_pfn); 576 u64 end = 0; 577 578 for (i = 0; i < mi->nr_blks; i++) { 579 if (nid != mi->blk[i].nid) 580 continue; 581 start = min(mi->blk[i].start, start); 582 end = max(mi->blk[i].end, end); 583 } 584 585 if (start >= end) 586 continue; 587 588 /* 589 * Don't confuse VM with a node that doesn't have the 590 * minimum amount of memory: 591 */ 592 if (end && (end - start) < NODE_MIN_SIZE) 593 continue; 594 595 alloc_node_data(nid); 596 } 597 598 /* Dump memblock with node info and return. */ 599 memblock_dump_all(); 600 return 0; 601 } 602 603 /* 604 * There are unfortunately some poorly designed mainboards around that 605 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 606 * mapping. To avoid this fill in the mapping for all possible CPUs, 607 * as the number of CPUs is not known yet. We round robin the existing 608 * nodes. 609 */ 610 static void __init numa_init_array(void) 611 { 612 int rr, i; 613 614 rr = first_node(node_online_map); 615 for (i = 0; i < nr_cpu_ids; i++) { 616 if (early_cpu_to_node(i) != NUMA_NO_NODE) 617 continue; 618 numa_set_node(i, rr); 619 rr = next_node_in(rr, node_online_map); 620 } 621 } 622 623 static int __init numa_init(int (*init_func)(void)) 624 { 625 int i; 626 int ret; 627 628 for (i = 0; i < MAX_LOCAL_APIC; i++) 629 set_apicid_to_node(i, NUMA_NO_NODE); 630 631 nodes_clear(numa_nodes_parsed); 632 nodes_clear(node_possible_map); 633 nodes_clear(node_online_map); 634 memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 635 WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, 636 MAX_NUMNODES)); 637 WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, 638 MAX_NUMNODES)); 639 /* In case that parsing SRAT failed. */ 640 WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); 641 numa_reset_distance(); 642 643 ret = init_func(); 644 if (ret < 0) 645 return ret; 646 647 /* 648 * We reset memblock back to the top-down direction 649 * here because if we configured ACPI_NUMA, we have 650 * parsed SRAT in init_func(). It is ok to have the 651 * reset here even if we did't configure ACPI_NUMA 652 * or acpi numa init fails and fallbacks to dummy 653 * numa init. 654 */ 655 memblock_set_bottom_up(false); 656 657 ret = numa_cleanup_meminfo(&numa_meminfo); 658 if (ret < 0) 659 return ret; 660 661 numa_emulation(&numa_meminfo, numa_distance_cnt); 662 663 ret = numa_register_memblks(&numa_meminfo); 664 if (ret < 0) 665 return ret; 666 667 for (i = 0; i < nr_cpu_ids; i++) { 668 int nid = early_cpu_to_node(i); 669 670 if (nid == NUMA_NO_NODE) 671 continue; 672 if (!node_online(nid)) 673 numa_clear_node(i); 674 } 675 numa_init_array(); 676 677 return 0; 678 } 679 680 /** 681 * dummy_numa_init - Fallback dummy NUMA init 682 * 683 * Used if there's no underlying NUMA architecture, NUMA initialization 684 * fails, or NUMA is disabled on the command line. 685 * 686 * Must online at least one node and add memory blocks that cover all 687 * allowed memory. This function must not fail. 688 */ 689 static int __init dummy_numa_init(void) 690 { 691 printk(KERN_INFO "%s\n", 692 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 693 printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", 694 0LLU, PFN_PHYS(max_pfn) - 1); 695 696 node_set(0, numa_nodes_parsed); 697 numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); 698 699 return 0; 700 } 701 702 /** 703 * x86_numa_init - Initialize NUMA 704 * 705 * Try each configured NUMA initialization method until one succeeds. The 706 * last fallback is dummy single node config encomapssing whole memory and 707 * never fails. 708 */ 709 void __init x86_numa_init(void) 710 { 711 if (!numa_off) { 712 #ifdef CONFIG_ACPI_NUMA 713 if (!numa_init(x86_acpi_numa_init)) 714 return; 715 #endif 716 #ifdef CONFIG_AMD_NUMA 717 if (!numa_init(amd_numa_init)) 718 return; 719 #endif 720 } 721 722 numa_init(dummy_numa_init); 723 } 724 725 static void __init init_memory_less_node(int nid) 726 { 727 unsigned long zones_size[MAX_NR_ZONES] = {0}; 728 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 729 730 /* Allocate and initialize node data. Memory-less node is now online.*/ 731 alloc_node_data(nid); 732 free_area_init_node(nid, zones_size, 0, zholes_size); 733 734 /* 735 * All zonelists will be built later in start_kernel() after per cpu 736 * areas are initialized. 737 */ 738 } 739 740 /* 741 * Setup early cpu_to_node. 742 * 743 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 744 * and apicid_to_node[] tables have valid entries for a CPU. 745 * This means we skip cpu_to_node[] initialisation for NUMA 746 * emulation and faking node case (when running a kernel compiled 747 * for NUMA on a non NUMA box), which is OK as cpu_to_node[] 748 * is already initialized in a round robin manner at numa_init_array, 749 * prior to this call, and this initialization is good enough 750 * for the fake NUMA cases. 751 * 752 * Called before the per_cpu areas are setup. 753 */ 754 void __init init_cpu_to_node(void) 755 { 756 int cpu; 757 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 758 759 BUG_ON(cpu_to_apicid == NULL); 760 761 for_each_possible_cpu(cpu) { 762 int node = numa_cpu_node(cpu); 763 764 if (node == NUMA_NO_NODE) 765 continue; 766 767 if (!node_online(node)) 768 init_memory_less_node(node); 769 770 numa_set_node(cpu, node); 771 } 772 } 773 774 #ifndef CONFIG_DEBUG_PER_CPU_MAPS 775 776 # ifndef CONFIG_NUMA_EMU 777 void numa_add_cpu(int cpu) 778 { 779 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 780 } 781 782 void numa_remove_cpu(int cpu) 783 { 784 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 785 } 786 # endif /* !CONFIG_NUMA_EMU */ 787 788 #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 789 790 int __cpu_to_node(int cpu) 791 { 792 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 793 printk(KERN_WARNING 794 "cpu_to_node(%d): usage too early!\n", cpu); 795 dump_stack(); 796 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 797 } 798 return per_cpu(x86_cpu_to_node_map, cpu); 799 } 800 EXPORT_SYMBOL(__cpu_to_node); 801 802 /* 803 * Same function as cpu_to_node() but used if called before the 804 * per_cpu areas are setup. 805 */ 806 int early_cpu_to_node(int cpu) 807 { 808 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 809 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 810 811 if (!cpu_possible(cpu)) { 812 printk(KERN_WARNING 813 "early_cpu_to_node(%d): no per_cpu area!\n", cpu); 814 dump_stack(); 815 return NUMA_NO_NODE; 816 } 817 return per_cpu(x86_cpu_to_node_map, cpu); 818 } 819 820 void debug_cpumask_set_cpu(int cpu, int node, bool enable) 821 { 822 struct cpumask *mask; 823 824 if (node == NUMA_NO_NODE) { 825 /* early_cpu_to_node() already emits a warning and trace */ 826 return; 827 } 828 mask = node_to_cpumask_map[node]; 829 if (!mask) { 830 pr_err("node_to_cpumask_map[%i] NULL\n", node); 831 dump_stack(); 832 return; 833 } 834 835 if (enable) 836 cpumask_set_cpu(cpu, mask); 837 else 838 cpumask_clear_cpu(cpu, mask); 839 840 printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", 841 enable ? "numa_add_cpu" : "numa_remove_cpu", 842 cpu, node, cpumask_pr_args(mask)); 843 return; 844 } 845 846 # ifndef CONFIG_NUMA_EMU 847 static void numa_set_cpumask(int cpu, bool enable) 848 { 849 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); 850 } 851 852 void numa_add_cpu(int cpu) 853 { 854 numa_set_cpumask(cpu, true); 855 } 856 857 void numa_remove_cpu(int cpu) 858 { 859 numa_set_cpumask(cpu, false); 860 } 861 # endif /* !CONFIG_NUMA_EMU */ 862 863 /* 864 * Returns a pointer to the bitmask of CPUs on Node 'node'. 865 */ 866 const struct cpumask *cpumask_of_node(int node) 867 { 868 if (node >= nr_node_ids) { 869 printk(KERN_WARNING 870 "cpumask_of_node(%d): node > nr_node_ids(%d)\n", 871 node, nr_node_ids); 872 dump_stack(); 873 return cpu_none_mask; 874 } 875 if (node_to_cpumask_map[node] == NULL) { 876 printk(KERN_WARNING 877 "cpumask_of_node(%d): no node_to_cpumask_map!\n", 878 node); 879 dump_stack(); 880 return cpu_online_mask; 881 } 882 return node_to_cpumask_map[node]; 883 } 884 EXPORT_SYMBOL(cpumask_of_node); 885 886 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 887 888 #ifdef CONFIG_MEMORY_HOTPLUG 889 int memory_add_physaddr_to_nid(u64 start) 890 { 891 struct numa_meminfo *mi = &numa_meminfo; 892 int nid = mi->blk[0].nid; 893 int i; 894 895 for (i = 0; i < mi->nr_blks; i++) 896 if (mi->blk[i].start <= start && mi->blk[i].end > start) 897 nid = mi->blk[i].nid; 898 return nid; 899 } 900 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 901 #endif 902