/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/lmb.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/system.h>
#include <asm/smp.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;

static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes
	 * We want to continue from where we left off the last time
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

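/*
 * Illustrative note (not part of the original source): the boundaries parsed
 * above come from the "numa=fake=" early parameter (see early_numa() below),
 * which leaves cmdline pointing just past "fake=".  Each comma- or
 * space-separated value is handed to memparse(), so suffixes such as M and G
 * are accepted.  As a hypothetical example, booting with
 *
 *	numa=fake=1G,4G
 *
 * keeps memory whose end address is at or below 1G in the current node,
 * assigns memory ending between 1G and 4G to fake node 1, and memory ending
 * above 4G to fake node 2.
 */
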
/*
 * get_active_region_work_fn - A helper function for get_node_active_region
 *	Returns datax set to the start_pfn and end_pfn if they contain
 *	the initial value of datax->start_pfn between them
 * @start_pfn: start page(inclusive) of region to check
 * @end_pfn: end page(exclusive) of region to check
 * @datax: comes in with ->start_pfn set to value to search for and
 *	goes out with active range if it contains it
 * Returns 1 if search value is in range else 0
 */
static int __init get_active_region_work_fn(unsigned long start_pfn,
					unsigned long end_pfn, void *datax)
{
	struct node_active_region *data;
	data = (struct node_active_region *)datax;

	if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
		data->start_pfn = start_pfn;
		data->end_pfn = end_pfn;
		return 1;
	}
	return 0;

}

/*
 * get_node_active_region - Return active region containing start_pfn
 * Active range returned is empty if none found.
 * @start_pfn: The page to return the region for.
 * @node_ar: Returned set to the active region containing start_pfn
 */
static void __init get_node_active_region(unsigned long start_pfn,
					  struct node_active_region *node_ar)
{
	int nid = early_pfn_to_nid(start_pfn);

	node_ar->nid = nid;
	node_ar->start_pfn = start_pfn;
	node_ar->end_pfn = start_pfn;
	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
}

static void __cpuinit map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
}

#ifdef CONFIG_HOTPLUG_CPU
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */

static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	const unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
		/* Try interrupt server first */
		interrupt_server = of_get_property(cpu_node,
					"ibm,ppc-interrupt-server#s", &len);

		len = len / sizeof(u32);

		if (interrupt_server && (len > 0)) {
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			reg = of_get_property(cpu_node, "reg", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}

/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

/*
 * Returns the property linux,drconf-usable-memory if
 * it exists (the property exists only in kexec/kdump kernels,
 * added by kexec-tools)
 */
static const u32 *of_get_usable_memory(struct device_node *memory)
{
	const u32 *prop;
	u32 len;
	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;
	return prop;
}

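/*
 * Illustrative example (hypothetical values, not from the original source):
 * an "ibm,associativity" property is a count followed by that many domain
 * ids, one per associativity level.  With
 *
 *	ibm,associativity = <4 0 0 1 2>
 *
 * the leading 4 says four levels follow.  If find_min_common_depth() below
 * returned 4, of_node_to_nid_single() would read tmp[4] and report nid 2
 * for this device.
 */
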
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const unsigned int *tmp;

	if (min_common_depth == -1)
		goto out;

	tmp = of_get_associativity(device);
	if (!tmp)
		goto out;

	if (tmp[0] >= min_common_depth)
		nid = tmp[min_common_depth];

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;
out:
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);

/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int __init find_min_common_depth(void)
{
	int depth;
	const unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * This property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = of_get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	if (ref_points && (len >= 2 * sizeof(unsigned int))) {
		depth = ref_points[1];
	} else {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

struct of_drconf_cell {
	u64	base_addr;
	u32	drc_index;
	u32	reserved;
	u32	aa_index;
	u32	flags;
};

#define DRCONF_MEM_ASSIGNED	0x00000008
#define DRCONF_MEM_AI_INVALID	0x00000040
#define DRCONF_MEM_RESERVED	0x00000080

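/*
 * Illustrative layout (hypothetical values, not from the original source):
 * each ibm,dynamic-memory list entry decoded by read_drconf_cell() below is
 * n_mem_addr_cells cells of base address followed by four more cells.  With
 * n_mem_addr_cells == 2, one entry could look like
 *
 *	0x0 0x10000000	(base_addr = 0x10000000)
 *	0x8000000a	(drc_index)
 *	0x0		(reserved)
 *	0x2		(aa_index)
 *	0x8		(flags = DRCONF_MEM_ASSIGNED)
 *
 * Since DRCONF_MEM_RESERVED is clear and DRCONF_MEM_ASSIGNED is set, this
 * block would be parsed rather than skipped by parse_drconf_memory().
 */
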
/*
 * Read the next lmb list entry from the ibm,dynamic-memory property
 * and return the information in the provided of_drconf_cell structure.
 */
static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
{
	const u32 *cp;

	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);

	cp = *cellp;
	drmem->drc_index = cp[0];
	drmem->reserved = cp[1];
	drmem->aa_index = cp[2];
	drmem->flags = cp[3];

	*cellp = cp + 4;
}

/*
 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 *
 * The layout of the ibm,dynamic-memory property is a number N of lmb
 * list entries followed by N lmb list entries.  Each lmb list entry
 * contains information as laid out in the of_drconf_cell struct above.
 */
static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
{
	const u32 *prop;
	u32 len, entries;

	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	entries = *prop++;

	/* Now that we know the number of entries, revalidate the size
	 * of the property read in to ensure we have everything
	 */
	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
		return 0;

	*dm = prop;
	return entries;
}

/*
 * Retrieve and validate the ibm,lmb-size property for drconf memory
 * from the device tree.
 */
static u64 of_get_lmb_size(struct device_node *memory)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,lmb-size", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	return read_n_cells(n_mem_size_cells, &prop);
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const u32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct device_node *memory,
			       struct assoc_arrays *aa)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int))
		return -1;

	aa->n_arrays = *prop++;
	aa->array_sz = *prop++;

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
				   struct assoc_arrays *aa)
{
	int default_nid = 0;
	int nid = default_nid;
	int index;

	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
	    drmem->aa_index < aa->n_arrays) {
		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
		nid = aa->arrays[index];

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;
	}

	return nid;
}

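/*
 * Illustrative example (hypothetical values, not from the original source):
 * with an ibm,associativity-lookup-arrays property of
 *
 *	<2 4  0 0 0 0  0 0 1 1>
 *
 * there are N = 2 arrays of M = 4 entries each.  A drconf entry with
 * aa_index = 1 and min_common_depth = 4 selects
 * aa->arrays[1 * 4 + 4 - 1] = aa->arrays[7] = 1, i.e. nid 1.
 */
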
/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = find_cpu_node(lcpu);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = any_online_node(NODE_MASK_ALL);
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}

static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
				       unsigned long action,
				       void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.  Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= lmb_end_of_DRAM())
		return size;

	if (start >= lmb_end_of_DRAM())
		return 0;

	return lmb_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const u32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) duples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

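/*
 * Illustrative note (hypothetical values, not from the original source): in a
 * kexec/kdump kernel the linux,drconf-usable-memory property holds, for each
 * ibm,dynamic-memory entry in order, a counter and then that many
 * (base, size) pairs encoded with n_mem_addr_cells/n_mem_size_cells.  A
 * counter of 0 means none of that LMB is usable and parse_drconf_memory()
 * below skips it; a counter of 2 is followed by two (base, size) pairs that
 * replace the LMB's own base and size in the loop below.
 */
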
/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const u32 *dm, *usm;
	unsigned int n, rc, ranges, is_kexec_kdump = 0;
	unsigned long lmb_size, base, size, sz;
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return;

	/* check if this is a kexec/kdump kernel */
	usm = of_get_usable_memory(memory);
	if (usm != NULL)
		is_kexec_kdump = 1;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if the reserved bit is set in flags (0x80)
		   or if the block is not assigned to this partition (0x8) */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		base = drmem.base_addr;
		size = lmb_size;
		ranges = 1;

		if (is_kexec_kdump) {
			ranges = read_usm_ranges(&usm);
			if (!ranges) /* there are no (base, size) duples */
				continue;
		}
		do {
			if (is_kexec_kdump) {
				base = read_n_cells(n_mem_addr_cells, &usm);
				size = read_n_cells(n_mem_size_cells, &usm);
			}
			nid = of_drconf_to_nid_single(&drmem, &aa);
			fake_numa_create_new_node(
						((base + size) >> PAGE_SHIFT),
						&nid);
			node_set_online(nid);
			sz = numa_enforce_memory_limit(base, size);
			if (sz)
				add_active_range(nid, base >> PAGE_SHIFT,
						 (base >> PAGE_SHIFT)
						 + (sz >> PAGE_SHIFT));
		} while (--ranges);
	}
}

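/*
 * Illustrative example (hypothetical values, not from the original source):
 * parse_numa_properties() below walks each "memory" node and decodes its
 * "reg" (or "linux,usable-memory") property as (start, size) pairs of
 * n_mem_addr_cells and n_mem_size_cells cells.  With two address cells and
 * two size cells, a property of
 *
 *	reg = <0x0 0x0 0x0 0x20000000  0x0 0x20000000 0x0 0x20000000>
 *
 * describes two ranges, 512MB at 0 and 512MB at 0x20000000, and
 * ranges = (len >> 2) / (2 + 2) = 2.
 */
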
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		int nid;

		cpu = find_cpu_node(i);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties.  If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		add_active_range(nid, start >> PAGE_SHIFT,
				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
	 * property in the ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory)
		parse_drconf_memory(memory);

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int i, nid = 0;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for (i = 0; i < lmb.memory.cnt; ++i) {
		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);

		fake_numa_create_new_node(end_pfn, &nid);
		add_active_range(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}
}

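/*
 * Illustrative note (hypothetical output, not from the original source): the
 * loop in dump_numa_cpu_topology() below prints each run of consecutive CPUs
 * in a node as "first-last" and an isolated CPU on its own, so a node owning
 * CPUs 0-3 and 8 would be reported roughly as
 *
 *	Node 0 CPUs: 0-3 8
 */
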
void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", NR_CPUS - 1);
		printk("\n");
	}
}

static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < lmb_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

/*
 * Allocate some memory, satisfying the lmb or bootmem allocator where
 * required.  nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the virtual address of the memory.
 */
static void __init *careful_zallocation(int nid, unsigned long size,
					unsigned long align,
					unsigned long end_pfn)
{
	void *ret;
	int new_nid;
	unsigned long ret_paddr;

	ret_paddr = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret_paddr)
		ret_paddr = __lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret_paddr)
		panic("numa.c: cannot allocate %lu bytes for node %d",
		      size, nid);

	ret = __va(ret_paddr);

	/*
	 * We initialize the nodes in numeric order: 0, 1, 2...
	 * and hand over control from the LMB allocator to the
	 * bootmem allocator.  If this function is called for
	 * node 5, then we know that all nodes <5 are using the
	 * bootmem allocator instead of the LMB allocator.
	 *
	 * So, check the nid from which this allocation came
	 * and double check to see if we need to use bootmem
	 * instead of the LMB.  We don't free the LMB memory
	 * since it would be useless.
	 */
	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = __alloc_bootmem_node(NODE_DATA(new_nid),
				size, align, 0);

		dbg("alloc_bootmem %p %lx\n", ret, size);
	}

	memset(ret, 0, size);
	return ret;
}

static struct notifier_block __cpuinitdata ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};

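/*
 * Illustrative walk-through (hypothetical scenario, not from the original
 * source) of mark_reserved_regions_for_nid() below: if an lmb.reserved entry
 * spans two active regions of the node, the first pass trims reserve_size to
 * the end of the first active region and calls reserve_bootmem_node() for
 * that piece, then start_pfn/physbase advance to the next active region and
 * the remaining size is reserved on the next iteration.  Pieces that land in
 * another node's active region are skipped, since only NODE_DATA(nid) is
 * known to be valid at this point.
 */
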
static void mark_reserved_regions_for_nid(int nid)
{
	struct pglist_data *node = NODE_DATA(nid);
	int i;

	for (i = 0; i < lmb.reserved.cnt; i++) {
		unsigned long physbase = lmb.reserved.region[i].base;
		unsigned long size = lmb.reserved.region[i].size;
		unsigned long start_pfn = physbase >> PAGE_SHIFT;
		unsigned long end_pfn = PFN_UP(physbase + size);
		struct node_active_region node_ar;
		unsigned long node_end_pfn = node->node_start_pfn +
					     node->node_spanned_pages;

		/*
		 * Check to make sure that this lmb.reserved area is
		 * within the bounds of the node that we care about.
		 * Checking the nid of the start and end points is not
		 * sufficient because the reserved area could span the
		 * entire node.
		 */
		if (end_pfn <= node->node_start_pfn ||
		    start_pfn >= node_end_pfn)
			continue;

		get_node_active_region(start_pfn, &node_ar);
		while (start_pfn < end_pfn &&
			node_ar.start_pfn < node_ar.end_pfn) {
			unsigned long reserve_size = size;
			/*
			 * if reserved region extends past active region
			 * then trim size to active region
			 */
			if (end_pfn > node_ar.end_pfn)
				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
					- physbase;
			/*
			 * Only worry about *this* node, others may not
			 * yet have valid NODE_DATA().
			 */
			if (node_ar.nid == nid) {
				dbg("reserve_bootmem %lx %lx nid=%d\n",
					physbase, reserve_size, node_ar.nid);
				reserve_bootmem_node(NODE_DATA(node_ar.nid),
						physbase, reserve_size,
						BOOTMEM_DEFAULT);
			}
			/*
			 * if reserved region is contained in the active region
			 * then done.
			 */
			if (end_pfn <= node_ar.end_pfn)
				break;

			/*
			 * reserved region extends past the active region
			 *   get next active region that contains this
			 *   reserved region
			 */
			start_pfn = node_ar.end_pfn;
			physbase = start_pfn << PAGE_SHIFT;
			size = size - reserve_size;
			get_node_active_region(start_pfn, &node_ar);
		}
	}
}


void __init do_init_bootmem(void)
{
	int nid;

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	register_cpu_notifier(&ppc64_numa_nb);
	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
			  (void *)(unsigned long)boot_cpuid);

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		void *bootmem_vaddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/*
		 * Allocate the node structure node local if possible
		 *
		 * Be careful moving this around, as it relies on all
		 * previous nodes' bootmem to be initialized and have
		 * all reserved areas marked.
		 */
		NODE_DATA(nid) = careful_zallocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_vaddr = careful_zallocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);

		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);

		init_bootmem_node(NODE_DATA(nid),
				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);
		/*
		 * Be very careful about moving this around.  Future
		 * calls to careful_zallocation() depend on this getting
		 * done correctly.
		 */
		mark_reserved_regions_for_nid(nid);
		sparse_memory_present_with_active_regions(nid);
	}
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
	free_area_init_nodes(max_zone_pfns);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Validate the node associated with the memory section we are
 * trying to add.
 */
int valid_hot_add_scn(int *nid, unsigned long start, u32 lmb_size,
		      unsigned long scn_addr)
{
	nodemask_t nodes;

	if (*nid < 0 || !node_online(*nid))
		*nid = any_online_node(NODE_MASK_ALL);

	if ((scn_addr >= start) && (scn_addr < (start + lmb_size))) {
		nodes_setall(nodes);
		while (NODE_DATA(*nid)->node_spanned_pages == 0) {
			node_clear(*nid, nodes);
			*nid = any_online_node(nodes);
		}

		return 1;
	}

	return 0;
}

/*
 * Find the node associated with a hot added memory section represented
 * by the ibm,dynamic-reconfiguration-memory node.
 */
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const u32 *dm;
	unsigned int n, rc;
	unsigned long lmb_size;
	int default_nid = any_online_node(NODE_MASK_ALL);
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return default_nid;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return default_nid;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return default_nid;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);

		if (valid_hot_add_scn(&nid, drmem.base_addr, lmb_size,
				      scn_addr))
			return nid;
	}

	BUG();	/* section address should be found above */
	return 0;
}

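/*
 * Illustrative note (hypothetical values, not from the original source): a
 * hot-added section here is a SPARSEMEM section (1 << SECTION_SIZE_BITS
 * bytes), assumed to be fully contained in a single LMB (see the comment
 * below).  For example, with 16MB sections and a 256MB lmb_size, a section
 * at 0x31000000 lies inside the LMB based at 0x30000000, so that LMB's
 * associativity determines the nid returned above.
 */
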
/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
 * sections are fully contained within a single LMB.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled || (min_common_depth < 0))
		return any_online_node(NODE_MASK_ALL);

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
		return nid;
	}

	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start, size;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
ha_new_range:
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);
		nid = of_node_to_nid_single(memory);

		if (valid_hot_add_scn(&nid, start, size, scn_addr)) {
			of_node_put(memory);
			return nid;
		}

		if (--ranges)		/* process all ranges in cell */
			goto ha_new_range;
	}
	BUG();	/* section address should be found above */
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */