// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/array_size.h>
#include <linux/sort.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/numa_memblks.h>

#include <asm/numa.h>

int numa_distance_cnt;
static u8 *numa_distance;

nodemask_t numa_nodes_parsed __initdata;

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	numa_distance = memblock_alloc(size, PAGE_SIZE);
	if (!numa_distance) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one which is large enough to accommodate all the
 * currently known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation, or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
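 *
 * A valid @distance must fit in the u8 distance table, and a node's
 * distance to itself must be LOCAL_DISTANCE; other values are rejected
 * with a one-time warning.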
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/**
 * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the numa_reserved_meminfo.
 *
 * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances
 * against memblock_type information and moves any that intersect reserved
 * ranges to numa_reserved_meminfo. However, when that information is known
 * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
 * to numa_reserved_meminfo directly.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
}

/**
 * numa_cleanup_meminfo - Clean up a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = memblock_start_of_DRAM();
	const u64 high = memblock_end_of_DRAM();
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
				bi->nid, bi->start, bi->end - 1, bj->start,
				bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
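 *
 * Any node that ends up containing kernel-reserved memory gets the
 * MEMBLOCK_HOTPLUG flag cleared on all of its ranges, so such nodes
 * cannot be hot-removed later.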
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all parsed memory blocks and use those ranges to
	 * set the nid in memblock.reserved. This will split up the
	 * memblock regions along node boundaries and will set the node IDs
	 * as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start,
					&memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_meminfo(struct numa_meminfo *mi)
{
	int i;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * At very early boot, the kernel has to use some memory, e.g. for
	 * loading the kernel image. We cannot prevent this anyway. So any
	 * node the kernel resides in should be un-hotpluggable.
	 *
	 * And when we come here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
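	 *
	 * When node IDs are not stored in page->flags (NODE_NOT_IN_PAGE_FLAGS),
	 * the nid is looked up via the sparsemem section, so no section may
	 * span memory from more than one node.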
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M;

			unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M;

			pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
				node_align_mb, sect_align_mb);
			return -EINVAL;
		}
	}

	return 0;
}

int __init numa_memblks_init(int (*init_func)(void),
			     bool memblock_force_top_down)
{
	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
				  NUMA_NO_NODE));
	/* In case parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, max_addr));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction
	 * here because if we configured ACPI_NUMA, we have
	 * parsed SRAT in init_func(). It is ok to have the
	 * reset here even if we didn't configure ACPI_NUMA
	 * or acpi numa init fails and falls back to dummy
	 * numa init.
	 */
	if (memblock_force_top_down)
		memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	return numa_register_meminfo(&numa_meminfo);
}

static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end.
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's
	 * end address and backfilling to it if needed.
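	 *
	 * A block that starts beyond the running prev_end has its start
	 * pulled back to prev_end; blocks that already overlap prev_end
	 * only advance it.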
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#ifdef CONFIG_NUMA_KEEP_MEMINFO
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif /* CONFIG_NUMA_KEEP_MEMINFO */