1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * pSeries NUMA support 4 * 5 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM 6 */ 7 #define pr_fmt(fmt) "numa: " fmt 8 9 #include <linux/threads.h> 10 #include <linux/memblock.h> 11 #include <linux/init.h> 12 #include <linux/mm.h> 13 #include <linux/mmzone.h> 14 #include <linux/export.h> 15 #include <linux/nodemask.h> 16 #include <linux/cpu.h> 17 #include <linux/notifier.h> 18 #include <linux/of.h> 19 #include <linux/of_address.h> 20 #include <linux/pfn.h> 21 #include <linux/cpuset.h> 22 #include <linux/node.h> 23 #include <linux/stop_machine.h> 24 #include <linux/proc_fs.h> 25 #include <linux/seq_file.h> 26 #include <linux/uaccess.h> 27 #include <linux/slab.h> 28 #include <asm/cputhreads.h> 29 #include <asm/sparsemem.h> 30 #include <asm/smp.h> 31 #include <asm/topology.h> 32 #include <asm/firmware.h> 33 #include <asm/paca.h> 34 #include <asm/hvcall.h> 35 #include <asm/setup.h> 36 #include <asm/vdso.h> 37 #include <asm/vphn.h> 38 #include <asm/drmem.h> 39 40 static int numa_enabled = 1; 41 42 static char *cmdline __initdata; 43 44 int numa_cpu_lookup_table[NR_CPUS]; 45 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 46 47 EXPORT_SYMBOL(numa_cpu_lookup_table); 48 EXPORT_SYMBOL(node_to_cpumask_map); 49 50 static int primary_domain_index; 51 static int n_mem_addr_cells, n_mem_size_cells; 52 53 #define FORM0_AFFINITY 0 54 #define FORM1_AFFINITY 1 55 #define FORM2_AFFINITY 2 56 static int affinity_form; 57 58 #define MAX_DISTANCE_REF_POINTS 4 59 static int distance_ref_points_depth; 60 static const __be32 *distance_ref_points; 61 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; 62 static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { 63 [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } 64 }; 65 static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; 66 67 /* 68 * Allocate node_to_cpumask_map based on number of available nodes 69 * Requires node_possible_map to be valid. 70 * 71 * Note: cpumask_of_node() is not valid until after this is done. 72 */ 73 static void __init setup_node_to_cpumask_map(void) 74 { 75 unsigned int node; 76 77 /* setup nr_node_ids if not done yet */ 78 if (nr_node_ids == MAX_NUMNODES) 79 setup_nr_node_ids(); 80 81 /* allocate the map */ 82 for_each_node(node) 83 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 84 85 /* cpumask_of_node() will now work */ 86 pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); 87 } 88 89 static int __init fake_numa_create_new_node(unsigned long end_pfn, 90 unsigned int *nid) 91 { 92 unsigned long long mem; 93 char *p = cmdline; 94 static unsigned int fake_nid; 95 static unsigned long long curr_boundary; 96 97 /* 98 * Modify node id, iff we started creating NUMA nodes 99 * We want to continue from where we left of the last time 100 */ 101 if (fake_nid) 102 *nid = fake_nid; 103 /* 104 * In case there are no more arguments to parse, the 105 * node_id should be the same as the last fake node id 106 * (we've handled this above). 107 */ 108 if (!p) 109 return 0; 110 111 mem = memparse(p, &p); 112 if (!mem) 113 return 0; 114 115 if (mem < curr_boundary) 116 return 0; 117 118 curr_boundary = mem; 119 120 if ((end_pfn << PAGE_SHIFT) > mem) { 121 /* 122 * Skip commas and spaces 123 */ 124 while (*p == ',' || *p == ' ' || *p == '\t') 125 p++; 126 127 cmdline = p; 128 fake_nid++; 129 *nid = fake_nid; 130 pr_debug("created new fake_node with id %d\n", fake_nid); 131 return 1; 132 } 133 return 0; 134 } 135 136 static void __init reset_numa_cpu_lookup_table(void) 137 { 138 unsigned int cpu; 139 140 for_each_possible_cpu(cpu) 141 numa_cpu_lookup_table[cpu] = -1; 142 } 143 144 void map_cpu_to_node(int cpu, int node) 145 { 146 update_numa_cpu_lookup_table(cpu, node); 147 148 if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) { 149 pr_debug("adding cpu %d to node %d\n", cpu, node); 150 cpumask_set_cpu(cpu, node_to_cpumask_map[node]); 151 } 152 } 153 154 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) 155 void unmap_cpu_from_node(unsigned long cpu) 156 { 157 int node = numa_cpu_lookup_table[cpu]; 158 159 if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { 160 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); 161 pr_debug("removing cpu %lu from node %d\n", cpu, node); 162 } else { 163 pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); 164 } 165 } 166 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ 167 168 static int __associativity_to_nid(const __be32 *associativity, 169 int max_array_sz) 170 { 171 int nid; 172 /* 173 * primary_domain_index is 1 based array index. 174 */ 175 int index = primary_domain_index - 1; 176 177 if (!numa_enabled || index >= max_array_sz) 178 return NUMA_NO_NODE; 179 180 nid = of_read_number(&associativity[index], 1); 181 182 /* POWER4 LPAR uses 0xffff as invalid node */ 183 if (nid == 0xffff || nid >= nr_node_ids) 184 nid = NUMA_NO_NODE; 185 return nid; 186 } 187 /* 188 * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA 189 * info is found. 190 */ 191 static int associativity_to_nid(const __be32 *associativity) 192 { 193 int array_sz = of_read_number(associativity, 1); 194 195 /* Skip the first element in the associativity array */ 196 return __associativity_to_nid((associativity + 1), array_sz); 197 } 198 199 static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) 200 { 201 int dist; 202 int node1, node2; 203 204 node1 = associativity_to_nid(cpu1_assoc); 205 node2 = associativity_to_nid(cpu2_assoc); 206 207 dist = numa_distance_table[node1][node2]; 208 if (dist <= LOCAL_DISTANCE) 209 return 0; 210 else if (dist <= REMOTE_DISTANCE) 211 return 1; 212 else 213 return 2; 214 } 215 216 static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) 217 { 218 int dist = 0; 219 220 int i, index; 221 222 for (i = 0; i < distance_ref_points_depth; i++) { 223 index = be32_to_cpu(distance_ref_points[i]); 224 if (cpu1_assoc[index] == cpu2_assoc[index]) 225 break; 226 dist++; 227 } 228 229 return dist; 230 } 231 232 int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) 233 { 234 /* We should not get called with FORM0 */ 235 VM_WARN_ON(affinity_form == FORM0_AFFINITY); 236 if (affinity_form == FORM1_AFFINITY) 237 return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); 238 return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); 239 } 240 241 /* must hold reference to node during call */ 242 static const __be32 *of_get_associativity(struct device_node *dev) 243 { 244 return of_get_property(dev, "ibm,associativity", NULL); 245 } 246 247 int __node_distance(int a, int b) 248 { 249 int i; 250 int distance = LOCAL_DISTANCE; 251 252 if (affinity_form == FORM2_AFFINITY) 253 return numa_distance_table[a][b]; 254 else if (affinity_form == FORM0_AFFINITY) 255 return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); 256 257 for (i = 0; i < distance_ref_points_depth; i++) { 258 if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) 259 break; 260 261 /* Double the distance for each NUMA level */ 262 distance *= 2; 263 } 264 265 return distance; 266 } 267 EXPORT_SYMBOL(__node_distance); 268 269 /* Returns the nid associated with the given device tree node, 270 * or -1 if not found. 271 */ 272 static int of_node_to_nid_single(struct device_node *device) 273 { 274 int nid = NUMA_NO_NODE; 275 const __be32 *tmp; 276 277 tmp = of_get_associativity(device); 278 if (tmp) 279 nid = associativity_to_nid(tmp); 280 return nid; 281 } 282 283 /* Walk the device tree upwards, looking for an associativity id */ 284 int of_node_to_nid(struct device_node *device) 285 { 286 int nid = NUMA_NO_NODE; 287 288 of_node_get(device); 289 while (device) { 290 nid = of_node_to_nid_single(device); 291 if (nid != -1) 292 break; 293 294 device = of_get_next_parent(device); 295 } 296 of_node_put(device); 297 298 return nid; 299 } 300 EXPORT_SYMBOL(of_node_to_nid); 301 302 static void __initialize_form1_numa_distance(const __be32 *associativity, 303 int max_array_sz) 304 { 305 int i, nid; 306 307 if (affinity_form != FORM1_AFFINITY) 308 return; 309 310 nid = __associativity_to_nid(associativity, max_array_sz); 311 if (nid != NUMA_NO_NODE) { 312 for (i = 0; i < distance_ref_points_depth; i++) { 313 const __be32 *entry; 314 int index = be32_to_cpu(distance_ref_points[i]) - 1; 315 316 /* 317 * broken hierarchy, return with broken distance table 318 */ 319 if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) 320 return; 321 322 entry = &associativity[index]; 323 distance_lookup_table[nid][i] = of_read_number(entry, 1); 324 } 325 } 326 } 327 328 static void initialize_form1_numa_distance(const __be32 *associativity) 329 { 330 int array_sz; 331 332 array_sz = of_read_number(associativity, 1); 333 /* Skip the first element in the associativity array */ 334 __initialize_form1_numa_distance(associativity + 1, array_sz); 335 } 336 337 /* 338 * Used to update distance information w.r.t newly added node. 339 */ 340 void update_numa_distance(struct device_node *node) 341 { 342 int nid; 343 344 if (affinity_form == FORM0_AFFINITY) 345 return; 346 else if (affinity_form == FORM1_AFFINITY) { 347 const __be32 *associativity; 348 349 associativity = of_get_associativity(node); 350 if (!associativity) 351 return; 352 353 initialize_form1_numa_distance(associativity); 354 return; 355 } 356 357 /* FORM2 affinity */ 358 nid = of_node_to_nid_single(node); 359 if (nid == NUMA_NO_NODE) 360 return; 361 362 /* 363 * With FORM2 we expect NUMA distance of all possible NUMA 364 * nodes to be provided during boot. 365 */ 366 WARN(numa_distance_table[nid][nid] == -1, 367 "NUMA distance details for node %d not provided\n", nid); 368 } 369 EXPORT_SYMBOL_GPL(update_numa_distance); 370 371 /* 372 * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} 373 * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} 374 */ 375 static void __init initialize_form2_numa_distance_lookup_table(void) 376 { 377 int i, j; 378 struct device_node *root; 379 const __u8 *form2_distances; 380 const __be32 *numa_lookup_index; 381 int form2_distances_length; 382 int max_numa_index, distance_index; 383 384 if (firmware_has_feature(FW_FEATURE_OPAL)) 385 root = of_find_node_by_path("/ibm,opal"); 386 else 387 root = of_find_node_by_path("/rtas"); 388 if (!root) 389 root = of_find_node_by_path("/"); 390 391 numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); 392 max_numa_index = of_read_number(&numa_lookup_index[0], 1); 393 394 /* first element of the array is the size and is encode-int */ 395 form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL); 396 form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1); 397 /* Skip the size which is encoded int */ 398 form2_distances += sizeof(__be32); 399 400 pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n", 401 form2_distances_length, max_numa_index); 402 403 for (i = 0; i < max_numa_index; i++) 404 /* +1 skip the max_numa_index in the property */ 405 numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); 406 407 408 if (form2_distances_length != max_numa_index * max_numa_index) { 409 WARN(1, "Wrong NUMA distance information\n"); 410 form2_distances = NULL; // don't use it 411 } 412 distance_index = 0; 413 for (i = 0; i < max_numa_index; i++) { 414 for (j = 0; j < max_numa_index; j++) { 415 int nodeA = numa_id_index_table[i]; 416 int nodeB = numa_id_index_table[j]; 417 int dist; 418 419 if (form2_distances) 420 dist = form2_distances[distance_index++]; 421 else if (nodeA == nodeB) 422 dist = LOCAL_DISTANCE; 423 else 424 dist = REMOTE_DISTANCE; 425 numa_distance_table[nodeA][nodeB] = dist; 426 pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist); 427 } 428 } 429 430 of_node_put(root); 431 } 432 433 static int __init find_primary_domain_index(void) 434 { 435 int index; 436 struct device_node *root; 437 438 /* 439 * Check for which form of affinity. 440 */ 441 if (firmware_has_feature(FW_FEATURE_OPAL)) { 442 affinity_form = FORM1_AFFINITY; 443 } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { 444 pr_debug("Using form 2 affinity\n"); 445 affinity_form = FORM2_AFFINITY; 446 } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { 447 pr_debug("Using form 1 affinity\n"); 448 affinity_form = FORM1_AFFINITY; 449 } else 450 affinity_form = FORM0_AFFINITY; 451 452 if (firmware_has_feature(FW_FEATURE_OPAL)) 453 root = of_find_node_by_path("/ibm,opal"); 454 else 455 root = of_find_node_by_path("/rtas"); 456 if (!root) 457 root = of_find_node_by_path("/"); 458 459 /* 460 * This property is a set of 32-bit integers, each representing 461 * an index into the ibm,associativity nodes. 462 * 463 * With form 0 affinity the first integer is for an SMP configuration 464 * (should be all 0's) and the second is for a normal NUMA 465 * configuration. We have only one level of NUMA. 466 * 467 * With form 1 affinity the first integer is the most significant 468 * NUMA boundary and the following are progressively less significant 469 * boundaries. There can be more than one level of NUMA. 470 */ 471 distance_ref_points = of_get_property(root, 472 "ibm,associativity-reference-points", 473 &distance_ref_points_depth); 474 475 if (!distance_ref_points) { 476 pr_debug("ibm,associativity-reference-points not found.\n"); 477 goto err; 478 } 479 480 distance_ref_points_depth /= sizeof(int); 481 if (affinity_form == FORM0_AFFINITY) { 482 if (distance_ref_points_depth < 2) { 483 pr_warn("short ibm,associativity-reference-points\n"); 484 goto err; 485 } 486 487 index = of_read_number(&distance_ref_points[1], 1); 488 } else { 489 /* 490 * Both FORM1 and FORM2 affinity find the primary domain details 491 * at the same offset. 492 */ 493 index = of_read_number(distance_ref_points, 1); 494 } 495 /* 496 * Warn and cap if the hardware supports more than 497 * MAX_DISTANCE_REF_POINTS domains. 498 */ 499 if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { 500 pr_warn("distance array capped at %d entries\n", 501 MAX_DISTANCE_REF_POINTS); 502 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; 503 } 504 505 of_node_put(root); 506 return index; 507 508 err: 509 of_node_put(root); 510 return -1; 511 } 512 513 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) 514 { 515 struct device_node *memory = NULL; 516 517 memory = of_find_node_by_type(memory, "memory"); 518 if (!memory) 519 panic("numa.c: No memory nodes found!"); 520 521 *n_addr_cells = of_n_addr_cells(memory); 522 *n_size_cells = of_n_size_cells(memory); 523 of_node_put(memory); 524 } 525 526 static unsigned long read_n_cells(int n, const __be32 **buf) 527 { 528 unsigned long result = 0; 529 530 while (n--) { 531 result = (result << 32) | of_read_number(*buf, 1); 532 (*buf)++; 533 } 534 return result; 535 } 536 537 struct assoc_arrays { 538 u32 n_arrays; 539 u32 array_sz; 540 const __be32 *arrays; 541 }; 542 543 /* 544 * Retrieve and validate the list of associativity arrays for drconf 545 * memory from the ibm,associativity-lookup-arrays property of the 546 * device tree.. 547 * 548 * The layout of the ibm,associativity-lookup-arrays property is a number N 549 * indicating the number of associativity arrays, followed by a number M 550 * indicating the size of each associativity array, followed by a list 551 * of N associativity arrays. 552 */ 553 static int of_get_assoc_arrays(struct assoc_arrays *aa) 554 { 555 struct device_node *memory; 556 const __be32 *prop; 557 u32 len; 558 559 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 560 if (!memory) 561 return -1; 562 563 prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len); 564 if (!prop || len < 2 * sizeof(unsigned int)) { 565 of_node_put(memory); 566 return -1; 567 } 568 569 aa->n_arrays = of_read_number(prop++, 1); 570 aa->array_sz = of_read_number(prop++, 1); 571 572 of_node_put(memory); 573 574 /* Now that we know the number of arrays and size of each array, 575 * revalidate the size of the property read in. 576 */ 577 if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int)) 578 return -1; 579 580 aa->arrays = prop; 581 return 0; 582 } 583 584 static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb) 585 { 586 struct assoc_arrays aa = { .arrays = NULL }; 587 int default_nid = NUMA_NO_NODE; 588 int nid = default_nid; 589 int rc, index; 590 591 if ((primary_domain_index < 0) || !numa_enabled) 592 return default_nid; 593 594 rc = of_get_assoc_arrays(&aa); 595 if (rc) 596 return default_nid; 597 598 if (primary_domain_index <= aa.array_sz && 599 !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { 600 const __be32 *associativity; 601 602 index = lmb->aa_index * aa.array_sz; 603 associativity = &aa.arrays[index]; 604 nid = __associativity_to_nid(associativity, aa.array_sz); 605 if (nid > 0 && affinity_form == FORM1_AFFINITY) { 606 /* 607 * lookup array associativity entries have 608 * no length of the array as the first element. 609 */ 610 __initialize_form1_numa_distance(associativity, aa.array_sz); 611 } 612 } 613 return nid; 614 } 615 616 /* 617 * This is like of_node_to_nid_single() for memory represented in the 618 * ibm,dynamic-reconfiguration-memory node. 619 */ 620 int of_drconf_to_nid_single(struct drmem_lmb *lmb) 621 { 622 struct assoc_arrays aa = { .arrays = NULL }; 623 int default_nid = NUMA_NO_NODE; 624 int nid = default_nid; 625 int rc, index; 626 627 if ((primary_domain_index < 0) || !numa_enabled) 628 return default_nid; 629 630 rc = of_get_assoc_arrays(&aa); 631 if (rc) 632 return default_nid; 633 634 if (primary_domain_index <= aa.array_sz && 635 !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { 636 const __be32 *associativity; 637 638 index = lmb->aa_index * aa.array_sz; 639 associativity = &aa.arrays[index]; 640 nid = __associativity_to_nid(associativity, aa.array_sz); 641 } 642 return nid; 643 } 644 645 #ifdef CONFIG_PPC_SPLPAR 646 647 static int __vphn_get_associativity(long lcpu, __be32 *associativity) 648 { 649 long rc, hwid; 650 651 /* 652 * On a shared lpar, device tree will not have node associativity. 653 * At this time lppaca, or its __old_status field may not be 654 * updated. Hence kernel cannot detect if its on a shared lpar. So 655 * request an explicit associativity irrespective of whether the 656 * lpar is shared or dedicated. Use the device tree property as a 657 * fallback. cpu_to_phys_id is only valid between 658 * smp_setup_cpu_maps() and smp_setup_pacas(). 659 */ 660 if (firmware_has_feature(FW_FEATURE_VPHN)) { 661 if (cpu_to_phys_id) 662 hwid = cpu_to_phys_id[lcpu]; 663 else 664 hwid = get_hard_smp_processor_id(lcpu); 665 666 rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); 667 if (rc == H_SUCCESS) 668 return 0; 669 } 670 671 return -1; 672 } 673 674 static int vphn_get_nid(long lcpu) 675 { 676 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; 677 678 679 if (!__vphn_get_associativity(lcpu, associativity)) 680 return associativity_to_nid(associativity); 681 682 return NUMA_NO_NODE; 683 684 } 685 #else 686 687 static int __vphn_get_associativity(long lcpu, __be32 *associativity) 688 { 689 return -1; 690 } 691 692 static int vphn_get_nid(long unused) 693 { 694 return NUMA_NO_NODE; 695 } 696 #endif /* CONFIG_PPC_SPLPAR */ 697 698 /* 699 * Figure out to which domain a cpu belongs and stick it there. 700 * Return the id of the domain used. 701 */ 702 static int numa_setup_cpu(unsigned long lcpu) 703 { 704 struct device_node *cpu; 705 int fcpu = cpu_first_thread_sibling(lcpu); 706 int nid = NUMA_NO_NODE; 707 708 if (!cpu_present(lcpu)) { 709 set_cpu_numa_node(lcpu, first_online_node); 710 return first_online_node; 711 } 712 713 /* 714 * If a valid cpu-to-node mapping is already available, use it 715 * directly instead of querying the firmware, since it represents 716 * the most recent mapping notified to us by the platform (eg: VPHN). 717 * Since cpu_to_node binding remains the same for all threads in the 718 * core. If a valid cpu-to-node mapping is already available, for 719 * the first thread in the core, use it. 720 */ 721 nid = numa_cpu_lookup_table[fcpu]; 722 if (nid >= 0) { 723 map_cpu_to_node(lcpu, nid); 724 return nid; 725 } 726 727 nid = vphn_get_nid(lcpu); 728 if (nid != NUMA_NO_NODE) 729 goto out_present; 730 731 cpu = of_get_cpu_node(lcpu, NULL); 732 733 if (!cpu) { 734 WARN_ON(1); 735 if (cpu_present(lcpu)) 736 goto out_present; 737 else 738 goto out; 739 } 740 741 nid = of_node_to_nid_single(cpu); 742 of_node_put(cpu); 743 744 out_present: 745 if (nid < 0 || !node_possible(nid)) 746 nid = first_online_node; 747 748 /* 749 * Update for the first thread of the core. All threads of a core 750 * have to be part of the same node. This not only avoids querying 751 * for every other thread in the core, but always avoids a case 752 * where virtual node associativity change causes subsequent threads 753 * of a core to be associated with different nid. However if first 754 * thread is already online, expect it to have a valid mapping. 755 */ 756 if (fcpu != lcpu) { 757 WARN_ON(cpu_online(fcpu)); 758 map_cpu_to_node(fcpu, nid); 759 } 760 761 map_cpu_to_node(lcpu, nid); 762 out: 763 return nid; 764 } 765 766 static void verify_cpu_node_mapping(int cpu, int node) 767 { 768 int base, sibling, i; 769 770 /* Verify that all the threads in the core belong to the same node */ 771 base = cpu_first_thread_sibling(cpu); 772 773 for (i = 0; i < threads_per_core; i++) { 774 sibling = base + i; 775 776 if (sibling == cpu || cpu_is_offline(sibling)) 777 continue; 778 779 if (cpu_to_node(sibling) != node) { 780 WARN(1, "CPU thread siblings %d and %d don't belong" 781 " to the same node!\n", cpu, sibling); 782 break; 783 } 784 } 785 } 786 787 /* Must run before sched domains notifier. */ 788 static int ppc_numa_cpu_prepare(unsigned int cpu) 789 { 790 int nid; 791 792 nid = numa_setup_cpu(cpu); 793 verify_cpu_node_mapping(cpu, nid); 794 return 0; 795 } 796 797 static int ppc_numa_cpu_dead(unsigned int cpu) 798 { 799 return 0; 800 } 801 802 /* 803 * Check and possibly modify a memory region to enforce the memory limit. 804 * 805 * Returns the size the region should have to enforce the memory limit. 806 * This will either be the original value of size, a truncated value, 807 * or zero. If the returned value of size is 0 the region should be 808 * discarded as it lies wholly above the memory limit. 809 */ 810 static unsigned long __init numa_enforce_memory_limit(unsigned long start, 811 unsigned long size) 812 { 813 /* 814 * We use memblock_end_of_DRAM() in here instead of memory_limit because 815 * we've already adjusted it for the limit and it takes care of 816 * having memory holes below the limit. Also, in the case of 817 * iommu_is_off, memory_limit is not set but is implicitly enforced. 818 */ 819 820 if (start + size <= memblock_end_of_DRAM()) 821 return size; 822 823 if (start >= memblock_end_of_DRAM()) 824 return 0; 825 826 return memblock_end_of_DRAM() - start; 827 } 828 829 /* 830 * Reads the counter for a given entry in 831 * linux,drconf-usable-memory property 832 */ 833 static inline int __init read_usm_ranges(const __be32 **usm) 834 { 835 /* 836 * For each lmb in ibm,dynamic-memory a corresponding 837 * entry in linux,drconf-usable-memory property contains 838 * a counter followed by that many (base, size) duple. 839 * read the counter from linux,drconf-usable-memory 840 */ 841 return read_n_cells(n_mem_size_cells, usm); 842 } 843 844 /* 845 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory 846 * node. This assumes n_mem_{addr,size}_cells have been set. 847 */ 848 static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, 849 const __be32 **usm, 850 void *data) 851 { 852 unsigned int ranges, is_kexec_kdump = 0; 853 unsigned long base, size, sz; 854 int nid; 855 856 /* 857 * Skip this block if the reserved bit is set in flags (0x80) 858 * or if the block is not assigned to this partition (0x8) 859 */ 860 if ((lmb->flags & DRCONF_MEM_RESERVED) 861 || !(lmb->flags & DRCONF_MEM_ASSIGNED)) 862 return 0; 863 864 if (*usm) 865 is_kexec_kdump = 1; 866 867 base = lmb->base_addr; 868 size = drmem_lmb_size(); 869 ranges = 1; 870 871 if (is_kexec_kdump) { 872 ranges = read_usm_ranges(usm); 873 if (!ranges) /* there are no (base, size) duple */ 874 return 0; 875 } 876 877 do { 878 if (is_kexec_kdump) { 879 base = read_n_cells(n_mem_addr_cells, usm); 880 size = read_n_cells(n_mem_size_cells, usm); 881 } 882 883 nid = get_nid_and_numa_distance(lmb); 884 fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), 885 &nid); 886 node_set_online(nid); 887 sz = numa_enforce_memory_limit(base, size); 888 if (sz) 889 memblock_set_node(base, sz, &memblock.memory, nid); 890 } while (--ranges); 891 892 return 0; 893 } 894 895 static int __init parse_numa_properties(void) 896 { 897 struct device_node *memory, *pci; 898 int default_nid = 0; 899 unsigned long i; 900 const __be32 *associativity; 901 902 if (numa_enabled == 0) { 903 pr_warn("disabled by user\n"); 904 return -1; 905 } 906 907 primary_domain_index = find_primary_domain_index(); 908 909 if (primary_domain_index < 0) { 910 /* 911 * if we fail to parse primary_domain_index from device tree 912 * mark the numa disabled, boot with numa disabled. 913 */ 914 numa_enabled = false; 915 return primary_domain_index; 916 } 917 918 pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index); 919 920 /* 921 * If it is FORM2 initialize the distance table here. 922 */ 923 if (affinity_form == FORM2_AFFINITY) 924 initialize_form2_numa_distance_lookup_table(); 925 926 /* 927 * Even though we connect cpus to numa domains later in SMP 928 * init, we need to know the node ids now. This is because 929 * each node to be onlined must have NODE_DATA etc backing it. 930 */ 931 for_each_present_cpu(i) { 932 __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; 933 struct device_node *cpu; 934 int nid = NUMA_NO_NODE; 935 936 memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); 937 938 if (__vphn_get_associativity(i, vphn_assoc) == 0) { 939 nid = associativity_to_nid(vphn_assoc); 940 initialize_form1_numa_distance(vphn_assoc); 941 } else { 942 943 /* 944 * Don't fall back to default_nid yet -- we will plug 945 * cpus into nodes once the memory scan has discovered 946 * the topology. 947 */ 948 cpu = of_get_cpu_node(i, NULL); 949 BUG_ON(!cpu); 950 951 associativity = of_get_associativity(cpu); 952 if (associativity) { 953 nid = associativity_to_nid(associativity); 954 initialize_form1_numa_distance(associativity); 955 } 956 of_node_put(cpu); 957 } 958 959 /* node_set_online() is an UB if 'nid' is negative */ 960 if (likely(nid >= 0)) 961 node_set_online(nid); 962 } 963 964 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); 965 966 for_each_node_by_type(memory, "memory") { 967 unsigned long start; 968 unsigned long size; 969 int nid; 970 int ranges; 971 const __be32 *memcell_buf; 972 unsigned int len; 973 974 memcell_buf = of_get_property(memory, 975 "linux,usable-memory", &len); 976 if (!memcell_buf || len <= 0) 977 memcell_buf = of_get_property(memory, "reg", &len); 978 if (!memcell_buf || len <= 0) 979 continue; 980 981 /* ranges in cell */ 982 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); 983 new_range: 984 /* these are order-sensitive, and modify the buffer pointer */ 985 start = read_n_cells(n_mem_addr_cells, &memcell_buf); 986 size = read_n_cells(n_mem_size_cells, &memcell_buf); 987 988 /* 989 * Assumption: either all memory nodes or none will 990 * have associativity properties. If none, then 991 * everything goes to default_nid. 992 */ 993 associativity = of_get_associativity(memory); 994 if (associativity) { 995 nid = associativity_to_nid(associativity); 996 initialize_form1_numa_distance(associativity); 997 } else 998 nid = default_nid; 999 1000 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); 1001 node_set_online(nid); 1002 1003 size = numa_enforce_memory_limit(start, size); 1004 if (size) 1005 memblock_set_node(start, size, &memblock.memory, nid); 1006 1007 if (--ranges) 1008 goto new_range; 1009 } 1010 1011 for_each_node_by_name(pci, "pci") { 1012 int nid = NUMA_NO_NODE; 1013 1014 associativity = of_get_associativity(pci); 1015 if (associativity) { 1016 nid = associativity_to_nid(associativity); 1017 initialize_form1_numa_distance(associativity); 1018 } 1019 if (likely(nid >= 0) && !node_online(nid)) 1020 node_set_online(nid); 1021 } 1022 1023 /* 1024 * Now do the same thing for each MEMBLOCK listed in the 1025 * ibm,dynamic-memory property in the 1026 * ibm,dynamic-reconfiguration-memory node. 1027 */ 1028 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1029 if (memory) { 1030 walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb); 1031 of_node_put(memory); 1032 } 1033 1034 return 0; 1035 } 1036 1037 static void __init setup_nonnuma(void) 1038 { 1039 unsigned long top_of_ram = memblock_end_of_DRAM(); 1040 unsigned long total_ram = memblock_phys_mem_size(); 1041 unsigned long start_pfn, end_pfn; 1042 unsigned int nid = 0; 1043 int i; 1044 1045 pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); 1046 pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); 1047 1048 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { 1049 fake_numa_create_new_node(end_pfn, &nid); 1050 memblock_set_node(PFN_PHYS(start_pfn), 1051 PFN_PHYS(end_pfn - start_pfn), 1052 &memblock.memory, nid); 1053 node_set_online(nid); 1054 } 1055 } 1056 1057 void __init dump_numa_cpu_topology(void) 1058 { 1059 unsigned int node; 1060 unsigned int cpu, count; 1061 1062 if (!numa_enabled) 1063 return; 1064 1065 for_each_online_node(node) { 1066 pr_info("Node %d CPUs:", node); 1067 1068 count = 0; 1069 /* 1070 * If we used a CPU iterator here we would miss printing 1071 * the holes in the cpumap. 1072 */ 1073 for (cpu = 0; cpu < nr_cpu_ids; cpu++) { 1074 if (cpumask_test_cpu(cpu, 1075 node_to_cpumask_map[node])) { 1076 if (count == 0) 1077 pr_cont(" %u", cpu); 1078 ++count; 1079 } else { 1080 if (count > 1) 1081 pr_cont("-%u", cpu - 1); 1082 count = 0; 1083 } 1084 } 1085 1086 if (count > 1) 1087 pr_cont("-%u", nr_cpu_ids - 1); 1088 pr_cont("\n"); 1089 } 1090 } 1091 1092 /* Initialize NODE_DATA for a node on the local memory */ 1093 static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) 1094 { 1095 u64 spanned_pages = end_pfn - start_pfn; 1096 1097 alloc_node_data(nid); 1098 1099 NODE_DATA(nid)->node_id = nid; 1100 NODE_DATA(nid)->node_start_pfn = start_pfn; 1101 NODE_DATA(nid)->node_spanned_pages = spanned_pages; 1102 } 1103 1104 static void __init find_possible_nodes(void) 1105 { 1106 struct device_node *rtas, *root; 1107 const __be32 *domains = NULL; 1108 int prop_length, max_nodes; 1109 u32 i; 1110 1111 if (!numa_enabled) 1112 return; 1113 1114 rtas = of_find_node_by_path("/rtas"); 1115 if (!rtas) 1116 return; 1117 1118 /* 1119 * ibm,current-associativity-domains is a fairly recent property. If 1120 * it doesn't exist, then fallback on ibm,max-associativity-domains. 1121 * Current denotes what the platform can support compared to max 1122 * which denotes what the Hypervisor can support. 1123 * 1124 * If the LPAR is migratable, new nodes might be activated after a LPM, 1125 * so we should consider the max number in that case. 1126 */ 1127 root = of_find_node_by_path("/"); 1128 if (!of_get_property(root, "ibm,migratable-partition", NULL)) 1129 domains = of_get_property(rtas, 1130 "ibm,current-associativity-domains", 1131 &prop_length); 1132 of_node_put(root); 1133 if (!domains) { 1134 domains = of_get_property(rtas, "ibm,max-associativity-domains", 1135 &prop_length); 1136 if (!domains) 1137 goto out; 1138 } 1139 1140 max_nodes = of_read_number(&domains[primary_domain_index], 1); 1141 pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); 1142 1143 for (i = 0; i < max_nodes; i++) { 1144 if (!node_possible(i)) 1145 node_set(i, node_possible_map); 1146 } 1147 1148 prop_length /= sizeof(int); 1149 if (prop_length > primary_domain_index + 2) 1150 coregroup_enabled = 1; 1151 1152 out: 1153 of_node_put(rtas); 1154 } 1155 1156 void __init mem_topology_setup(void) 1157 { 1158 int cpu; 1159 1160 max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; 1161 min_low_pfn = MEMORY_START >> PAGE_SHIFT; 1162 1163 /* 1164 * Linux/mm assumes node 0 to be online at boot. However this is not 1165 * true on PowerPC, where node 0 is similar to any other node, it 1166 * could be cpuless, memoryless node. So force node 0 to be offline 1167 * for now. This will prevent cpuless, memoryless node 0 showing up 1168 * unnecessarily as online. If a node has cpus or memory that need 1169 * to be online, then node will anyway be marked online. 1170 */ 1171 node_set_offline(0); 1172 1173 if (parse_numa_properties()) 1174 setup_nonnuma(); 1175 1176 /* 1177 * Modify the set of possible NUMA nodes to reflect information 1178 * available about the set of online nodes, and the set of nodes 1179 * that we expect to make use of for this platform's affinity 1180 * calculations. 1181 */ 1182 nodes_and(node_possible_map, node_possible_map, node_online_map); 1183 1184 find_possible_nodes(); 1185 1186 setup_node_to_cpumask_map(); 1187 1188 reset_numa_cpu_lookup_table(); 1189 1190 for_each_possible_cpu(cpu) { 1191 /* 1192 * Powerpc with CONFIG_NUMA always used to have a node 0, 1193 * even if it was memoryless or cpuless. For all cpus that 1194 * are possible but not present, cpu_to_node() would point 1195 * to node 0. To remove a cpuless, memoryless dummy node, 1196 * powerpc need to make sure all possible but not present 1197 * cpu_to_node are set to a proper node. 1198 */ 1199 numa_setup_cpu(cpu); 1200 } 1201 } 1202 1203 void __init initmem_init(void) 1204 { 1205 int nid; 1206 1207 memblock_dump_all(); 1208 1209 for_each_online_node(nid) { 1210 unsigned long start_pfn, end_pfn; 1211 1212 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 1213 setup_node_data(nid, start_pfn, end_pfn); 1214 } 1215 1216 sparse_init(); 1217 1218 /* 1219 * We need the numa_cpu_lookup_table to be accurate for all CPUs, 1220 * even before we online them, so that we can use cpu_to_{node,mem} 1221 * early in boot, cf. smp_prepare_cpus(). 1222 * _nocalls() + manual invocation is used because cpuhp is not yet 1223 * initialized for the boot CPU. 1224 */ 1225 cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare", 1226 ppc_numa_cpu_prepare, ppc_numa_cpu_dead); 1227 } 1228 1229 static int __init early_numa(char *p) 1230 { 1231 if (!p) 1232 return 0; 1233 1234 if (strstr(p, "off")) 1235 numa_enabled = 0; 1236 1237 p = strstr(p, "fake="); 1238 if (p) 1239 cmdline = p + strlen("fake="); 1240 1241 return 0; 1242 } 1243 early_param("numa", early_numa); 1244 1245 #ifdef CONFIG_MEMORY_HOTPLUG 1246 /* 1247 * Find the node associated with a hot added memory section for 1248 * memory represented in the device tree by the property 1249 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory. 1250 */ 1251 static int hot_add_drconf_scn_to_nid(unsigned long scn_addr) 1252 { 1253 struct drmem_lmb *lmb; 1254 unsigned long lmb_size; 1255 int nid = NUMA_NO_NODE; 1256 1257 lmb_size = drmem_lmb_size(); 1258 1259 for_each_drmem_lmb(lmb) { 1260 /* skip this block if it is reserved or not assigned to 1261 * this partition */ 1262 if ((lmb->flags & DRCONF_MEM_RESERVED) 1263 || !(lmb->flags & DRCONF_MEM_ASSIGNED)) 1264 continue; 1265 1266 if ((scn_addr < lmb->base_addr) 1267 || (scn_addr >= (lmb->base_addr + lmb_size))) 1268 continue; 1269 1270 nid = of_drconf_to_nid_single(lmb); 1271 break; 1272 } 1273 1274 return nid; 1275 } 1276 1277 /* 1278 * Find the node associated with a hot added memory section for memory 1279 * represented in the device tree as a node (i.e. memory@XXXX) for 1280 * each memblock. 1281 */ 1282 static int hot_add_node_scn_to_nid(unsigned long scn_addr) 1283 { 1284 struct device_node *memory; 1285 int nid = NUMA_NO_NODE; 1286 1287 for_each_node_by_type(memory, "memory") { 1288 int i = 0; 1289 1290 while (1) { 1291 struct resource res; 1292 1293 if (of_address_to_resource(memory, i++, &res)) 1294 break; 1295 1296 if ((scn_addr < res.start) || (scn_addr > res.end)) 1297 continue; 1298 1299 nid = of_node_to_nid_single(memory); 1300 break; 1301 } 1302 1303 if (nid >= 0) 1304 break; 1305 } 1306 1307 of_node_put(memory); 1308 1309 return nid; 1310 } 1311 1312 /* 1313 * Find the node associated with a hot added memory section. Section 1314 * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that 1315 * sections are fully contained within a single MEMBLOCK. 1316 */ 1317 int hot_add_scn_to_nid(unsigned long scn_addr) 1318 { 1319 struct device_node *memory = NULL; 1320 int nid; 1321 1322 if (!numa_enabled) 1323 return first_online_node; 1324 1325 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1326 if (memory) { 1327 nid = hot_add_drconf_scn_to_nid(scn_addr); 1328 of_node_put(memory); 1329 } else { 1330 nid = hot_add_node_scn_to_nid(scn_addr); 1331 } 1332 1333 if (nid < 0 || !node_possible(nid)) 1334 nid = first_online_node; 1335 1336 return nid; 1337 } 1338 1339 static u64 hot_add_drconf_memory_max(void) 1340 { 1341 struct device_node *memory = NULL; 1342 struct device_node *dn = NULL; 1343 const __be64 *lrdr = NULL; 1344 1345 dn = of_find_node_by_path("/rtas"); 1346 if (dn) { 1347 lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL); 1348 of_node_put(dn); 1349 if (lrdr) 1350 return be64_to_cpup(lrdr); 1351 } 1352 1353 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1354 if (memory) { 1355 of_node_put(memory); 1356 return drmem_lmb_memory_max(); 1357 } 1358 return 0; 1359 } 1360 1361 /* 1362 * memory_hotplug_max - return max address of memory that may be added 1363 * 1364 * This is currently only used on systems that support drconfig memory 1365 * hotplug. 1366 */ 1367 u64 memory_hotplug_max(void) 1368 { 1369 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM()); 1370 } 1371 #endif /* CONFIG_MEMORY_HOTPLUG */ 1372 1373 /* Virtual Processor Home Node (VPHN) support */ 1374 #ifdef CONFIG_PPC_SPLPAR 1375 static int topology_inited; 1376 1377 /* 1378 * Retrieve the new associativity information for a virtual processor's 1379 * home node. 1380 */ 1381 static long vphn_get_associativity(unsigned long cpu, 1382 __be32 *associativity) 1383 { 1384 long rc; 1385 1386 rc = hcall_vphn(get_hard_smp_processor_id(cpu), 1387 VPHN_FLAG_VCPU, associativity); 1388 1389 switch (rc) { 1390 case H_SUCCESS: 1391 pr_debug("VPHN hcall succeeded. Reset polling...\n"); 1392 goto out; 1393 1394 case H_FUNCTION: 1395 pr_err_ratelimited("VPHN unsupported. Disabling polling...\n"); 1396 break; 1397 case H_HARDWARE: 1398 pr_err_ratelimited("hcall_vphn() experienced a hardware fault " 1399 "preventing VPHN. Disabling polling...\n"); 1400 break; 1401 case H_PARAMETER: 1402 pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. " 1403 "Disabling polling...\n"); 1404 break; 1405 default: 1406 pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n" 1407 , rc); 1408 break; 1409 } 1410 out: 1411 return rc; 1412 } 1413 1414 void find_and_update_cpu_nid(int cpu) 1415 { 1416 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; 1417 int new_nid; 1418 1419 /* Use associativity from first thread for all siblings */ 1420 if (vphn_get_associativity(cpu, associativity)) 1421 return; 1422 1423 /* Do not have previous associativity, so find it now. */ 1424 new_nid = associativity_to_nid(associativity); 1425 1426 if (new_nid < 0 || !node_possible(new_nid)) 1427 new_nid = first_online_node; 1428 else 1429 // Associate node <-> cpu, so cpu_up() calls 1430 // try_online_node() on the right node. 1431 set_cpu_numa_node(cpu, new_nid); 1432 1433 pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid); 1434 } 1435 1436 int cpu_to_coregroup_id(int cpu) 1437 { 1438 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; 1439 int index; 1440 1441 if (cpu < 0 || cpu > nr_cpu_ids) 1442 return -1; 1443 1444 if (!coregroup_enabled) 1445 goto out; 1446 1447 if (!firmware_has_feature(FW_FEATURE_VPHN)) 1448 goto out; 1449 1450 if (vphn_get_associativity(cpu, associativity)) 1451 goto out; 1452 1453 index = of_read_number(associativity, 1); 1454 if (index > primary_domain_index + 1) 1455 return of_read_number(&associativity[index - 1], 1); 1456 1457 out: 1458 return cpu_to_core_id(cpu); 1459 } 1460 1461 static int topology_update_init(void) 1462 { 1463 topology_inited = 1; 1464 return 0; 1465 } 1466 device_initcall(topology_update_init); 1467 #endif /* CONFIG_PPC_SPLPAR */ 1468