// SPDX-License-Identifier: GPL-2.0
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>

#define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;

int __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
	return 0;
}

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}

static u64 __init mem_hole_size(u64 start, u64 end)
{
	unsigned long start_pfn = PFN_UP(start);
	unsigned long end_pfn = PFN_DOWN(end);

	if (start_pfn < end_pfn)
		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
	return 0;
}

/*
 * Sets up an emulated memory block of @size for node @nid, carved out of
 * physical block @phys_blk.  The return value is -errno if something went
 * wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = pb->nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
	return 0;
}

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from
 * addr to max_addr.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
	 * the division in ulong number of pages and convert back.
	 */
	size = max_addr - addr - mem_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node. "
			"NUMA emulation disabled.\n");
		return -1;
	}
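
	/*
	 * Worked example of the sizing above (illustrative figures only, not
	 * taken from a real machine): with nr_nodes = 4 and 3264 MiB of
	 * usable memory on a single physical node, size comes out to 816 MiB
	 * per node.  The 16 MiB per-node remainder adds up to two
	 * FAKE_NODE_MIN_SIZE (32 MiB) chunks, so big = 2.  After size is
	 * rounded down to 800 MiB, the first two emulated nodes (nid < big)
	 * are created as 832 MiB nodes and the remaining two as 800 MiB
	 * nodes, leaving aside the hole, ZONE_DMA32 and end-of-node
	 * adjustments applied in the loop below.
	 */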

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start - mem_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end - mem_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - mem_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}

static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
{
	unsigned long max_pfn = PHYS_PFN(max_addr);
	unsigned long base_pfn = PHYS_PFN(base);
	unsigned long hole_pfns = PHYS_PFN(hole);

	return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
}

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.
 *
 * Returns the next unused nid (i.e. the running count of emulated nodes set
 * up so far) on success or negative on error.
 */
static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size,
					      int nr_nodes, struct numa_memblk *pblk,
					      int nid)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	int i, ret, uniform = 0;
	u64 min_size;

	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
		return -1;

	/*
	 * In the 'uniform' case split the passed in physical node by
	 * nr_nodes, in the non-uniform case, ignore the passed in
	 * physical block and try to create nodes of at least size
	 * @size.
	 *
	 * In the uniform case, split the nodes strictly by physical
	 * capacity, i.e. ignore holes.  In the non-uniform case account
	 * for holes and treat @size as a minimum floor.
	 */
	if (!nr_nodes)
		nr_nodes = MAX_NUMNODES;
	else {
		nodes_clear(physnode_mask);
		node_set(pblk->nid, physnode_mask);
		uniform = 1;
	}

	if (uniform) {
		min_size = uniform_size(max_addr, addr, 0, nr_nodes);
		size = min_size;
	} else {
		/*
		 * The limit on emulated nodes is MAX_NUMNODES, so the
		 * size per node is increased accordingly if the
		 * requested size is too small.  This creates a uniform
		 * distribution of node sizes across the entire machine
		 * (but not necessarily over physical nodes).
		 */
		min_size = uniform_size(max_addr, addr,
				mem_hole_size(addr, max_addr), nr_nodes);
	}
	min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
			size >> 20, min_size >> 20);
		size = min_size;
	}
	size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}

			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			if (uniform)
				end = start + size;
			else
				end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if ((limit - end - mem_hole_size(end, limit) < size)
					&& !uniform)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return nid;
}

static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
			0, NULL, 0);
}
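
/*
 * Example of how the two splitting paths above are used (illustrative, not an
 * exhaustive description of the command-line syntax; the strings are parsed
 * by numa_emulation() below): "numa=fake=8U" takes the uniform path and
 * splits each physical node into 8 emulated nodes of equal capacity, holes
 * included, while "numa=fake=512M" takes the non-uniform path and carves out
 * as many nodes as fit, each with at least 512 MiB of non-reserved memory
 * rounded to FAKE_NODE_MIN_SIZE, interleaved across the physical nodes.
 */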

static int __init setup_emu2phys_nid(int *dfl_phys_nid)
{
	int i, max_emu_nid = 0;

	*dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (*dfl_phys_nid == NUMA_NO_NODE)
				*dfl_phys_nid = emu_nid_to_phys[i];
		}
	}

	return max_emu_nid;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains a 'U', the value is the
	 * number of uniform nodes to split each physical node into.  If it
	 * contains a 'M' or 'G', it represents the fixed node size.
	 * Otherwise, if it is just a single number N, split the system RAM
	 * into N fake nodes.
	 */
	if (strchr(emu_cmdline, 'U')) {
		nodemask_t physnode_mask = numa_nodes_parsed;
		unsigned long n;
		int nid = 0;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = -1;
		for_each_node_mask(i, physnode_mask) {
			/*
			 * The reason we pass in blk[0] is that
			 * numa_remove_memblk_from(), called by
			 * emu_setup_memblk(), will delete entry 0
			 * and then move everything else up in the pi.blk
			 * array.  Therefore we should always be looking
			 * at blk[0].
			 */
			ret = split_nodes_size_interleave_uniform(&ei, &pi,
					pi.blk[0].start, pi.blk[0].end, 0,
					n, &pi.blk[0], nid);
			if (ret < 0)
				break;
			if (ret < n) {
				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
						__func__, i, ret, n);
				ret = -1;
				break;
			}
			nid = ret;
		}
	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}
	if (*emu_cmdline == ':')
		emu_cmdline++;

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table */
	if (numa_dist_cnt) {
		phys_dist = memblock_alloc(phys_size, PAGE_SIZE);
		if (!phys_dist) {
			pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);

	/* commit */
	*numa_meminfo = ei;

	/* Make sure numa_nodes_parsed only contains emulated nodes */
	nodes_clear(numa_nodes_parsed);
	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
		if (ei.blk[i].start != ei.blk[i].end &&
		    ei.blk[i].nid != NUMA_NO_NODE)
			node_set(ei.blk[i].nid, numa_nodes_parsed);

	numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/* transform distance table */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (get_option(&emu_cmdline, &dist) == 2)
				;
			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}

	/* free the copied physical distance table */
	memblock_free(phys_dist, phys_size);
	return;

no_emu:
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}
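
/*
 * Illustrative example of the distance rebuild above (hypothetical two-socket
 * machine whose physical SLIT reports 10/21; not measured data): with
 * "numa=fake=4" and equal-sized physical nodes, emulated nodes 0 and 2
 * typically map to physical node 0 and nodes 1 and 3 to physical node 1, so
 * the emulated table ends up with dist(0, 2) = 10 and
 * dist(0, 1) = dist(0, 3) = 21.  Distance values appended after a ':' on the
 * command line, consumed by get_option() above, can override individual
 * entries.
 */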

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void numa_add_cpu(unsigned int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_remove_cpu(unsigned int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
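
/*
 * Illustrative note (hypothetical topology): with "numa=fake=4" on a machine
 * that has a single physical node, all four emulated nodes share that
 * physical node, so numa_add_cpu() above places every CPU in all four
 * emulated nodes' cpumasks and numa_remove_cpu() clears it from all of them.
 */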
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void numa_set_cpumask(unsigned int cpu, bool enable)
{
	int nid, physnid;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(nid) {
		if (emu_nid_to_phys[nid] != physnid)
			continue;

		debug_cpumask_set_cpu(cpu, nid, enable);
	}
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */