// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/array_size.h>
#include <linux/sort.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/numa_memblks.h>

static int numa_distance_cnt;
static u8 *numa_distance;

nodemask_t numa_nodes_parsed __initdata;

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/*
 * Set the nodes that have memory in @mi in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	numa_distance = memblock_alloc(size, PAGE_SIZE);
	if (!numa_distance) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation, or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
			from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}
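/*
 * Illustrative sketch only (not kernel code): a firmware table parser,
 * e.g. an ACPI SLIT parser, that discovered two nodes with hypothetical
 * distances might populate the table as
 *
 *	numa_set_distance(0, 0, LOCAL_DISTANCE);	// 10
 *	numa_set_distance(0, 1, 20);
 *	numa_set_distance(1, 0, 20);
 *	numa_set_distance(1, 1, LOCAL_DISTANCE);
 *
 * The first call allocates the table, sized for all nodes parsed so far;
 * __node_distance(0, 1) then returns 20, and any index outside the table
 * falls back to LOCAL_DISTANCE/REMOTE_DISTANCE below.
 */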
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero-length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = memblock_start_of_DRAM();
	const u64 high = memblock_end_of_DRAM();
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and make sure there are no empty blocks left behind */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node when the
			 * holes between them don't overlap with memory on
			 * other nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
				bi->nid, bi->start, bi->end - 1, bj->start,
				bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}
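/*
 * Worked example (illustrative addresses only): two node 0 blocks
 * [0x00000000-0x80000000) and [0x80000000-0x100000000) are merged by the
 * pass above into a single [0x00000000-0x100000000) entry, provided no
 * other node owns memory in between.  Overlapping blocks that belong to
 * different nodes are rejected with -EINVAL instead.
 */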
/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all parsed memory blocks and use those ranges to
	 * set the nid in memblock.reserved.  This will split up the
	 * memblock regions along node boundaries and will set the node IDs
	 * as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start,
					&memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_meminfo(struct numa_meminfo *mi)
{
	int i;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early on, the kernel has to use some memory, for example to
	 * load the kernel image.  We cannot prevent this anyway, so any
	 * node the kernel resides on should be un-hotpluggable.
	 *
	 * By the time we get here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;

			unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;

			pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
				node_align_mb, sect_align_mb);
			return -EINVAL;
		}
	}

	return 0;
}
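/*
 * To illustrate the granularity check above with typical (not file-specific)
 * values: on x86-64 with 4 KiB pages a memory section spans 2^27 bytes, so
 * PAGES_PER_SECTION is 32768 and the minimum node alignment is 128 MB.  If
 * nodes were interleaved at, say, 64 MB granularity while the node ID is not
 * stored in page->flags, the section array could not represent the
 * pfn -> nid mapping and the NUMA configuration is rejected.
 */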
int __init numa_memblks_init(int (*init_func)(void),
			     bool memblock_force_top_down)
{
	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
				  NUMA_NO_NODE));
	/* In case parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, max_addr));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction here because if
	 * we configured ACPI_NUMA, we have parsed SRAT in init_func().  It
	 * is OK to do the reset even if we didn't configure ACPI_NUMA, or
	 * if ACPI NUMA init fails and we fall back to dummy NUMA init.
	 */
	if (memblock_force_top_down)
		memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	return numa_register_meminfo(&numa_meminfo);
}
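/*
 * Usage sketch (hypothetical, not part of this file): an architecture's
 * setup code typically passes its firmware parser to numa_memblks_init():
 *
 *	static int __init my_arch_numa_init(void)	// hypothetical parser
 *	{
 *		// call numa_add_memblk()/numa_set_distance() per firmware entry
 *		return 0;
 *	}
 *
 *	numa_memblks_init(my_arch_numa_init, true);
 *
 * Passing true as the second argument restores top-down memblock
 * allocation after the parser (e.g. SRAT handling) may have switched it
 * to bottom-up.
 */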
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end.
 *
 * RETURNS:
 * 0		  : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end.  The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's end address
	 * and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}
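/*
 * Worked example (illustrative addresses only): with existing blocks
 * [0x1000-0x2000) and [0x3000-0x4000), a call to
 * numa_fill_memblks(0x0000, 0x5000) extends the first block down to
 * 0x0000, extends the last block up to 0x5000, and closes the gap by
 * pulling the second block's start back to 0x2000, leaving
 * [0x0000-0x2000) and [0x2000-0x5000).
 */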
#ifdef CONFIG_NUMA_KEEP_MEMINFO
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added, continue the search with the reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif /* CONFIG_NUMA_KEEP_MEMINFO */