/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/k8.h>

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

struct memnode memnode;

s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init populate_memnodemap(const struct bootnode *nodes,
                                      int numnodes, int shift, int *nodeids)
{
        unsigned long addr, end;
        int i, res = -1;

        memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= memnodemapsize)
                        return 0;
                do {
                        if (memnodemap[addr >> shift] != NUMA_NO_NODE)
                                return -1;

                        if (!nodeids)
                                memnodemap[addr >> shift] = i;
                        else
                                memnodemap[addr >> shift] = nodeids[i];

                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
        }
        return res;
}

static int __init allocate_cachealigned_memnodemap(void)
{
        unsigned long addr;

        memnodemap = memnode.embedded_map;
        if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
                return 0;

        addr = 0x8000;
        nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
        nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
                                      nodemap_size, L1_CACHE_BYTES);
        if (nodemap_addr == -1UL) {
                printk(KERN_ERR
                       "NUMA: Unable to allocate Memory to Node hash map\n");
                nodemap_addr = nodemap_size = 0;
                return -1;
        }
        memnodemap = phys_to_virt(nodemap_addr);
        reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");

        printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
               nodemap_addr, nodemap_addr + nodemap_size);
        return 0;
}
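/*
 * Once compute_hash_shift() has populated memnodemap[] and its return
 * value has been stored in memnode_shift, looking up the node of a
 * physical address reduces, roughly, to
 *
 *        nid = memnodemap[addr >> memnode_shift];
 *
 * which is essentially what phys_to_nid() does.
 */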
/*
 * The LSB of all start addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
                                         int numnodes)
{
        int i, nodes_used = 0;
        unsigned long start, end;
        unsigned long bitfield = 0, memtop = 0;

        for (i = 0; i < numnodes; i++) {
                start = nodes[i].start;
                end = nodes[i].end;
                if (start >= end)
                        continue;
                bitfield |= start;
                nodes_used++;
                if (end > memtop)
                        memtop = end;
        }
        if (nodes_used <= 1)
                i = 63;
        else
                i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
        memnodemapsize = (memtop >> i)+1;
        return i;
}

int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
                              int *nodeids)
{
        int shift;

        shift = extract_lsb_from_nodes(nodes, numnodes);
        if (allocate_cachealigned_memnodemap())
                return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
               shift);

        if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
                printk(KERN_INFO "Your memory is not aligned; you need to "
                       "rebuild your kernel with a bigger NODEMAPSIZE, "
                       "shift=%d\n", shift);
                return -1;
        }
        return shift;
}

int early_pfn_to_nid(unsigned long pfn)
{
        return phys_to_nid(pfn << PAGE_SHIFT);
}
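/*
 * early_node_mem() first tries to carve the requested size out of the
 * node's own [start, end) range via find_e820_area(); if that fails it
 * falls back to __alloc_bootmem_nopanic(), which may hand back memory
 * that lives on a different node.
 */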
static void * __init early_node_mem(int nodeid, unsigned long start,
                                    unsigned long end, unsigned long size,
                                    unsigned long align)
{
        unsigned long mem = find_e820_area(start, end, size, align);
        void *ptr;

        if (mem != -1L)
                return __va(mem);

        ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
        if (ptr == NULL) {
                printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
                       size, nodeid);
                return NULL;
        }
        return ptr;
}

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start,
                               unsigned long end)
{
        unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
        unsigned long bootmap_start, nodedata_phys;
        void *bootmap;
        const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        int nid;

        start = roundup(start, ZONE_ALIGN);

        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
               start, end);

        start_pfn = start >> PAGE_SHIFT;
        last_pfn = end >> PAGE_SHIFT;

        node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
                                           SMP_CACHE_BYTES);
        if (node_data[nodeid] == NULL)
                return;
        nodedata_phys = __pa(node_data[nodeid]);
        printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
               nodedata_phys + pgdat_size - 1);

        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

        /*
         * Find a place for the bootmem map.
         * nodedata_phys could end up on another node via alloc_bootmem,
         * so make sure bootmap_start is not too low; otherwise
         * early_node_mem would satisfy it with find_e820_area instead of
         * alloc_bootmem, which could clash with the reserved range.
         */
        bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
        nid = phys_to_nid(nodedata_phys);
        if (nid == nodeid)
                bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
        else
                bootmap_start = roundup(start, PAGE_SIZE);
        /*
         * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
         * it aligned to PAGE_SIZE.
         */
        bootmap = early_node_mem(nodeid, bootmap_start, end,
                                 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
        if (bootmap == NULL) {
                if (nodedata_phys < start || nodedata_phys >= end)
                        free_bootmem(nodedata_phys, pgdat_size);
                node_data[nodeid] = NULL;
                return;
        }
        bootmap_start = __pa(bootmap);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, last_pfn);

        printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
               bootmap_start, bootmap_start + bootmap_size - 1,
               bootmap_pages);

        free_bootmem_with_active_regions(nodeid, end);

        /*
         * Convert the early reservations to bootmem reservations now;
         * otherwise early_node_mem could hand out early-reserved memory
         * belonging to a previous node.
         */
        early_res_to_bootmem(start, end);

        /*
         * In some cases early_node_mem could use alloc_bootmem to get a
         * range on another node; don't reserve that again.
         */
        if (nid != nodeid)
                printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
        else
                reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
                                     pgdat_size, BOOTMEM_DEFAULT);
        nid = phys_to_nid(bootmap_start);
        if (nid != nodeid)
                printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
        else
                reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
                                     bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);

#ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
#endif
        node_set_online(nodeid);
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this, fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round-robin over the
 * existing nodes.
 */
void __init numa_init_array(void)
{
        int rr, i;

        rr = first_node(node_online_map);
        for (i = 0; i < NR_CPUS; i++) {
                if (early_cpu_to_node(i) != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
}

#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static char *cmdline __initdata;

/*
 * Sets up nid to cover the range from addr to addr + size. If the end
 * boundary is greater than max_addr, then max_addr is used instead.
 * The return value is 0 if there is additional memory left for
 * allocation past addr and -1 otherwise. addr is adjusted to be at
 * the end of the node.
 */
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
                                   u64 size, u64 max_addr)
{
        int ret = 0;

        nodes[nid].start = *addr;
        *addr += size;
        if (*addr >= max_addr) {
                *addr = max_addr;
                ret = -1;
        }
        nodes[nid].end = *addr;
        node_set(nid, node_possible_map);
        printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
               nodes[nid].start, nodes[nid].end,
               (nodes[nid].end - nodes[nid].start) >> 20);
        return ret;
}
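/*
 * Note that split_nodes_equally() sizes fake nodes by usable memory only
 * (e820 holes inside a node's range do not count towards its share), and
 * both split helpers below work in units of FAKE_NODE_MIN_SIZE.
 */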
/*
 * Splits num_nodes nodes up equally starting at node_start. The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
                                      u64 max_addr, int node_start,
                                      int num_nodes)
{
        unsigned int big;
        u64 size;
        int i;

        if (num_nodes <= 0)
                return -1;
        if (num_nodes > MAX_NUMNODES)
                num_nodes = MAX_NUMNODES;
        size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
               num_nodes;
        /*
         * Calculate the number of big nodes that can be allocated as a result
         * of consolidating the leftovers.
         */
        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
              FAKE_NODE_MIN_SIZE;

        /* Round down to nearest FAKE_NODE_MIN_SIZE. */
        size &= FAKE_NODE_MIN_HASH_MASK;
        if (!size) {
                printk(KERN_ERR "Not enough memory for each node. "
                       "NUMA emulation disabled.\n");
                return -1;
        }

        for (i = node_start; i < num_nodes + node_start; i++) {
                u64 end = *addr + size;

                if (i < big)
                        end += FAKE_NODE_MIN_SIZE;
                /*
                 * The final node can have the remaining system RAM. Other
                 * nodes receive roughly the same amount of available pages.
                 */
                if (i == num_nodes + node_start - 1)
                        end = max_addr;
                else
                        while (end - *addr - e820_hole_size(*addr, end) <
                               size) {
                                end += FAKE_NODE_MIN_SIZE;
                                if (end > max_addr) {
                                        end = max_addr;
                                        break;
                                }
                        }
                if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
                        break;
        }
        return i - node_start + 1;
}

/*
 * Splits the remaining system RAM into chunks of size. The remaining memory is
 * always assigned to a final node and can be asymmetric. Returns the number of
 * nodes split.
 */
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
                                      u64 max_addr, int node_start, u64 size)
{
        int i = node_start;

        size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
        while (!setup_node_range(i++, nodes, addr, size, max_addr))
                ;
        return i - node_start;
}
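/*
 * numa=fake accepts either a plain node count or a comma-separated list
 * of sizes in megabytes, each with an optional "<count>*" coefficient,
 * roughly:
 *
 *        numa=fake=8                     eight equally sized nodes
 *        numa=fake=2*512,1024,*128       two 512MB nodes, one 1024MB node,
 *                                        and the remainder in 128MB chunks
 *
 * as parsed by numa_emulation() below.
 */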
/*
 * Sets up the system RAM area from start_pfn to last_pfn according to the
 * numa=fake command-line option.
 */
static struct bootnode nodes[MAX_NUMNODES] __initdata;

static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
{
        u64 size, addr = start_pfn << PAGE_SHIFT;
        u64 max_addr = last_pfn << PAGE_SHIFT;
        int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;

        memset(&nodes, 0, sizeof(nodes));
        /*
         * If the numa=fake command-line is just a single number N, split the
         * system RAM into N fake nodes.
         */
        if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
                long n = simple_strtol(cmdline, NULL, 0);

                num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
                if (num_nodes < 0)
                        return num_nodes;
                goto out;
        }

        /* Parse the command line. */
        for (coeff_flag = 0; ; cmdline++) {
                if (*cmdline && isdigit(*cmdline)) {
                        num = num * 10 + *cmdline - '0';
                        continue;
                }
                if (*cmdline == '*') {
                        if (num > 0)
                                coeff = num;
                        coeff_flag = 1;
                }
                if (!*cmdline || *cmdline == ',') {
                        if (!coeff_flag)
                                coeff = 1;
                        /*
                         * Round down to the nearest FAKE_NODE_MIN_SIZE.
                         * Command-line coefficients are in megabytes.
                         */
                        size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
                        if (size)
                                for (i = 0; i < coeff; i++, num_nodes++)
                                        if (setup_node_range(num_nodes, nodes,
                                                &addr, size, max_addr) < 0)
                                                goto done;
                        if (!*cmdline)
                                break;
                        coeff_flag = 0;
                        coeff = -1;
                }
                num = 0;
        }
done:
        if (!num_nodes)
                return -1;
        /* Fill remainder of system RAM, if appropriate. */
        if (addr < max_addr) {
                if (coeff_flag && coeff < 0) {
                        /* Split remaining nodes into num-sized chunks */
                        num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
                                                         num_nodes, num);
                        goto out;
                }
                switch (*(cmdline - 1)) {
                case '*':
                        /* Split remaining nodes into coeff chunks */
                        if (coeff <= 0)
                                break;
                        num_nodes += split_nodes_equally(nodes, &addr, max_addr,
                                                         num_nodes, coeff);
                        break;
                case ',':
                        /* Do not allocate remaining system RAM */
                        break;
                default:
                        /* Give one final node */
                        setup_node_range(num_nodes, nodes, &addr,
                                         max_addr - addr, max_addr);
                        num_nodes++;
                }
        }
out:
        memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
                       "disabled.\n");
                return -1;
        }

        /*
         * We need to vacate all active ranges that may have been registered by
         * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
         * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
         */
        remove_all_active_ranges();
#ifdef CONFIG_ACPI_NUMA
        acpi_numa = -1;
#endif
        for_each_node_mask(i, node_possible_map) {
                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                             nodes[i].end >> PAGE_SHIFT);
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        }
        acpi_fake_nodes(nodes, num_nodes);
        numa_init_array();
        return 0;
}
#endif /* CONFIG_NUMA_EMU */
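/*
 * initmem_init() tries each NUMA discovery method in turn: numa=fake
 * emulation (if configured), then ACPI SRAT, then the AMD K8 northbridge
 * registers; if all of them fail (or numa=off was given), it falls back
 * to a single dummy node covering all of memory.
 */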
"NUMA turned off" : "No NUMA configuration found"); 542 543 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 544 start_pfn << PAGE_SHIFT, 545 last_pfn << PAGE_SHIFT); 546 /* setup dummy node covering all memory */ 547 memnode_shift = 63; 548 memnodemap = memnode.embedded_map; 549 memnodemap[0] = 0; 550 node_set_online(0); 551 node_set(0, node_possible_map); 552 for (i = 0; i < NR_CPUS; i++) 553 numa_set_node(i, 0); 554 e820_register_active_regions(0, start_pfn, last_pfn); 555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); 556 } 557 558 unsigned long __init numa_free_all_bootmem(void) 559 { 560 unsigned long pages = 0; 561 int i; 562 563 for_each_online_node(i) 564 pages += free_all_bootmem_node(NODE_DATA(i)); 565 566 return pages; 567 } 568 569 void __init paging_init(void) 570 { 571 unsigned long max_zone_pfns[MAX_NR_ZONES]; 572 573 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 574 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 575 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 576 max_zone_pfns[ZONE_NORMAL] = max_pfn; 577 578 sparse_memory_present_with_active_regions(MAX_NUMNODES); 579 sparse_init(); 580 581 free_area_init_nodes(max_zone_pfns); 582 } 583 584 static __init int numa_setup(char *opt) 585 { 586 if (!opt) 587 return -EINVAL; 588 if (!strncmp(opt, "off", 3)) 589 numa_off = 1; 590 #ifdef CONFIG_NUMA_EMU 591 if (!strncmp(opt, "fake=", 5)) 592 cmdline = opt + 5; 593 #endif 594 #ifdef CONFIG_ACPI_NUMA 595 if (!strncmp(opt, "noacpi", 6)) 596 acpi_numa = -1; 597 if (!strncmp(opt, "hotadd=", 7)) 598 hotadd_percent = simple_strtoul(opt+7, NULL, 10); 599 #endif 600 return 0; 601 } 602 early_param("numa", numa_setup); 603 604 #ifdef CONFIG_NUMA 605 /* 606 * Setup early cpu_to_node. 607 * 608 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 609 * and apicid_to_node[] tables have valid entries for a CPU. 610 * This means we skip cpu_to_node[] initialisation for NUMA 611 * emulation and faking node case (when running a kernel compiled 612 * for NUMA on a non NUMA box), which is OK as cpu_to_node[] 613 * is already initialized in a round robin manner at numa_init_array, 614 * prior to this call, and this initialization is good enough 615 * for the fake NUMA cases. 616 * 617 * Called before the per_cpu areas are setup. 618 */ 619 void __init init_cpu_to_node(void) 620 { 621 int cpu; 622 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 623 624 BUG_ON(cpu_to_apicid == NULL); 625 626 for_each_possible_cpu(cpu) { 627 int node; 628 u16 apicid = cpu_to_apicid[cpu]; 629 630 if (apicid == BAD_APICID) 631 continue; 632 node = apicid_to_node[apicid]; 633 if (node == NUMA_NO_NODE) 634 continue; 635 if (!node_online(node)) 636 continue; 637 numa_set_node(cpu, node); 638 } 639 } 640 #endif 641 642 643