// SPDX-License-Identifier: GPL-2.0
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>
#include <acpi/acpi_numa.h>

#define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;

int __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
	return 0;
}

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}

static u64 __init mem_hole_size(u64 start, u64 end)
{
	unsigned long start_pfn = PFN_UP(start);
	unsigned long end_pfn = PFN_DOWN(end);

	if (start_pfn < end_pfn)
		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
	return 0;
}

/*
 * Carve @size bytes off the start of physical block @phys_blk in @pi and
 * assign them to emulated node @nid in @ei.  The return value is -errno if
 * something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = pb->nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
	return 0;
}

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
 * to max_addr.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
	 * the division in ulong number of pages and convert back.
	 */
	size = max_addr - addr - mem_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;
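
	/*
	 * For example, with roughly 1000MB of usable memory and numa=fake=3,
	 * the ~333MB per-node target is rounded down to 320MB below; the
	 * ~13MB-per-node remainder adds up to one extra FAKE_NODE_MIN_SIZE
	 * chunk, so big = 1 and the first fake node is sized 352MB while the
	 * others get 320MB, the last one absorbing what is left through the
	 * end-of-node checks in the loop below.
	 */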

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node.  NUMA emulation disabled.\n");
		return -1;
	}

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start - mem_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end - mem_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - mem_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}
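
/*
 * Per-node target size for the [@base, @max_addr) range less @hole bytes,
 * divided evenly across @nr_nodes.  The arithmetic is done in pages so that
 * 32-bit builds avoid a 64-bit division (cf. split_nodes_interleave()).
 */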
static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
{
	unsigned long max_pfn = PHYS_PFN(max_addr);
	unsigned long base_pfn = PHYS_PFN(base);
	unsigned long hole_pfns = PHYS_PFN(hole);

	return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
}

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.
 *
 * Returns the next free emulated nid (i.e. @nid plus the number of nodes
 * created) on success or negative on error.
 */
static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
						      struct numa_meminfo *pi,
						      u64 addr, u64 max_addr, u64 size,
						      int nr_nodes, struct numa_memblk *pblk,
						      int nid)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	int i, ret, uniform = 0;
	u64 min_size;

	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
		return -1;

	/*
	 * In the 'uniform' case split the passed in physical node by
	 * nr_nodes, in the non-uniform case, ignore the passed in
	 * physical block and try to create nodes of at least size
	 * @size.
	 *
	 * In the uniform case, split the nodes strictly by physical
	 * capacity, i.e. ignore holes.  In the non-uniform case account
	 * for holes and treat @size as a minimum floor.
	 */
	if (!nr_nodes)
		nr_nodes = MAX_NUMNODES;
	else {
		nodes_clear(physnode_mask);
		node_set(pblk->nid, physnode_mask);
		uniform = 1;
	}

	if (uniform) {
		min_size = uniform_size(max_addr, addr, 0, nr_nodes);
		size = min_size;
	} else {
		/*
		 * The limit on emulated nodes is MAX_NUMNODES, so the
		 * size per node is increased accordingly if the
		 * requested size is too small.  This creates a uniform
		 * distribution of node sizes across the entire machine
		 * (but not necessarily over physical nodes).
		 */
		min_size = uniform_size(max_addr, addr,
					mem_hole_size(addr, max_addr), nr_nodes);
	}
	min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
		       size >> 20, min_size >> 20);
		size = min_size;
	}
	size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}

			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			if (uniform)
				end = start + size;
			else
				end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if ((limit - end - mem_hole_size(end, limit) < size) &&
			    !uniform)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return nid;
}

static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
						   0, NULL, 0);
}

static int __init setup_emu2phys_nid(int *dfl_phys_nid)
{
	int i, max_emu_nid = 0;

	*dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (*dfl_phys_nid == NUMA_NO_NODE)
				*dfl_phys_nid = emu_nid_to_phys[i];
		}
	}

	return max_emu_nid;
}
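
/*
 * Forms of the numa=fake= option handled by numa_emulation() below (the
 * examples are illustrative):
 *
 *   numa=fake=<N>		split system RAM into N interleaved nodes,
 *				e.g. numa=fake=4
 *   numa=fake=<N>U		split each physical node into N emulated
 *				nodes, e.g. numa=fake=2U
 *   numa=fake=<size>[MG]	emulated nodes of roughly the given size,
 *				e.g. numa=fake=512M
 *
 * A ':' separated suffix of comma-separated values can override the emulated
 * inter-node distances; it is consumed by get_option() in the distance-table
 * loop below.
 */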

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;
	nodemask_t physnode_mask = numa_nodes_parsed;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains a 'U', each physical node is
	 * split into that many uniform emulated nodes.  If it contains an 'M'
	 * or 'G', it represents the fixed node size.  Otherwise, if it is
	 * just a single number N, split the system RAM into N fake nodes.
	 */
	if (strchr(emu_cmdline, 'U')) {
		unsigned long n;
		int nid = 0;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = -1;
		for_each_node_mask(i, physnode_mask) {
			/*
			 * blk[0] is passed in deliberately:
			 * numa_remove_memblk_from(), called by
			 * emu_setup_memblk(), deletes entry 0 and moves
			 * everything else up in the pi.blk array, so
			 * blk[0] is always the next unconsumed block.
			 */
			ret = split_nodes_size_interleave_uniform(&ei, &pi,
					pi.blk[0].start, pi.blk[0].end, 0,
					n, &pi.blk[0], nid);
			if (ret < 0)
				break;
			if (ret < n) {
				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
					__func__, i, ret, n);
				ret = -1;
				break;
			}
			nid = ret;
		}
	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}
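	/*
	 * Anything after a ':' is an optional list of comma-separated
	 * distances; skip the separator so the distance-table loop below can
	 * consume the values with get_option().
	 */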
	if (*emu_cmdline == ':')
		emu_cmdline++;

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table */
	if (numa_dist_cnt) {
		phys_dist = memblock_alloc(phys_size, PAGE_SIZE);
		if (!phys_dist) {
			pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);

	/* Make sure numa_nodes_parsed only contains emulated nodes */
	nodes_clear(numa_nodes_parsed);
	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
		if (ei.blk[i].start != ei.blk[i].end &&
		    ei.blk[i].nid != NUMA_NO_NODE)
			node_set(ei.blk[i].nid, numa_nodes_parsed);

	/*
	 * Fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision
	 * with faked numa nodes, particularly during later memory hotplug
	 * handling, and also update numa_nodes_parsed accordingly.
	 */
	ret = fix_pxm_node_maps(max_emu_nid);
	if (ret < 0)
		goto no_emu;

	/* commit */
	*numa_meminfo = ei;

	numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1);

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < max_emu_nid + 1; i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/* transform distance table */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (get_option(&emu_cmdline, &dist) == 2)
				;
			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}
	for (i = 0; i < numa_distance_cnt; i++) {
		for (j = 0; j < numa_distance_cnt; j++) {
			int physi, physj;
			u8 dist;

			/* distance between fake nodes is already ok */
			if (emu_nid_to_phys[i] != NUMA_NO_NODE &&
			    emu_nid_to_phys[j] != NUMA_NO_NODE)
				continue;
			if (emu_nid_to_phys[i] != NUMA_NO_NODE)
				physi = emu_nid_to_phys[i];
			else
				physi = i - max_emu_nid;
			if (emu_nid_to_phys[j] != NUMA_NO_NODE)
				physj = emu_nid_to_phys[j];
			else
				physj = j - max_emu_nid;
			dist = phys_dist[physi * numa_dist_cnt + physj];
			numa_set_distance(i, j, dist);
		}
	}

	/* free the copied physical distance table */
	memblock_free(phys_dist, phys_size);
	return;

no_emu:
	numa_nodes_parsed = physnode_mask;
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}
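
/*
 * emu_nid_to_phys[] drives the CPU <-> node mapping below: a CPU is added to
 * every emulated node carved out of its physical node.  For example, with
 * numa=fake=8 on a two-node machine, emulated nodes 0, 2, 4 and 6 would
 * typically map to physical node 0, so a CPU sitting on physical node 0 ends
 * up in the cpumask of each of those four emulated nodes.
 */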

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void numa_add_cpu(unsigned int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_remove_cpu(unsigned int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void numa_set_cpumask(unsigned int cpu, bool enable)
{
	int nid, physnid;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(nid) {
		if (emu_nid_to_phys[nid] != physnid)
			continue;

		debug_cpumask_set_cpu(cpu, nid, enable);
	}
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */