// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/array_size.h>
#include <linux/sort.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/numa_memblks.h>

int numa_distance_cnt;
static u8 *numa_distance;

nodemask_t numa_nodes_parsed __initdata;

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/*
 * Set the nodes which have memory in @mi in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

/*
 * Allocate the distance table sized for all currently known nodes and fill
 * it with the default LOCAL_DISTANCE / REMOTE_DISTANCE values.
 */
static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	numa_distance = memblock_alloc(size, PAGE_SIZE);
	if (!numa_distance) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation, or if @distance doesn't make sense, the
 * call is ignored.  This is to allow simplification of specific NUMA
 * config implementations.
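 *
 * Distances are stored as u8, so a value that does not fit into 0-255 is
 * rejected as invalid.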
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

/* Distance between @from and @to; default distances if the table has no entry */
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to move out of @src
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/**
 * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the numa_reserved_meminfo.
 *
 * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances
 * against memblock_type information and moves any that intersect reserved
 * ranges to numa_reserved_meminfo.
 * However, when that information is known
 * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
 * to numa_reserved_meminfo directly.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = memblock_start_of_DRAM();
	const u64 high = memblock_end_of_DRAM();
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
				bi->nid, bi->start, bi->end - 1, bj->start,
				bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
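 *
 * Memory used by the kernel itself cannot be migrated or hot-removed, so
 * the nodes that contain it must not be marked hotpluggable.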
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all parsed memory blocks and use those ranges to
	 * set the nid in memblock.reserved.  This will split up the
	 * memblock regions along node boundaries and will set the node IDs
	 * as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start,
					&memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_meminfo(struct numa_meminfo *mi)
{
	int i;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early on, the kernel has to use some memory, e.g. for
	 * loading the kernel image.  We cannot prevent this anyway, so
	 * any node the kernel resides in must stay un-hotpluggable.
	 *
	 * And by the time we get here, allocating the node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
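	 *
	 * When node IDs are not kept in page flags, the pfn -> nid lookup
	 * relies on the memory section array, so node boundaries must be
	 * aligned to at least a section; otherwise a single section could
	 * span more than one node.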
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;

			unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;

			pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
				node_align_mb, sect_align_mb);
			return -EINVAL;
		}
	}

	return 0;
}

int __init numa_memblks_init(int (*init_func)(void),
			     bool memblock_force_top_down)
{
	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
				  NUMA_NO_NODE));
	/* In case parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, max_addr));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction here because if
	 * we configured ACPI_NUMA, we have parsed SRAT in init_func().
	 * It is ok to have the reset here even if we didn't configure
	 * ACPI_NUMA, or if ACPI NUMA init fails and falls back to dummy
	 * NUMA init.
	 */
	if (memblock_force_top_down)
		memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	return numa_register_meminfo(&numa_meminfo);
}

/* Compare two numa_memblk pointers by start address, for sort() */
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end.
 *
 * RETURNS:
 * 0               : Success
 * NUMA_NO_MEMBLK  : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end.  The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's
	 * end address and backfilling to it if needed.
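	 * A gap is closed by pulling the next memblk's start back to the
	 * previous memblk's end; overlapping memblks are left untouched.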
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#ifdef CONFIG_NUMA_KEEP_MEMINFO
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif /* CONFIG_NUMA_KEEP_MEMINFO */