// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/array_size.h>
#include <linux/sort.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/numa_memblks.h>

#include <asm/numa.h>

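/*
 * The NUMA distance table is a flat numa_distance_cnt x numa_distance_cnt
 * matrix of u8 values; the distance from node 'from' to node 'to' is stored
 * at numa_distance[from * numa_distance_cnt + to].
 */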
int numa_distance_cnt;
static u8 *numa_distance;

nodemask_t numa_nodes_parsed __initdata;

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/*
 * Set, in *@nodemask, the nodes which have memory in @mi.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

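	/*
	 * The table must be indexable by the highest node ID seen so far,
	 * not merely by the number of nodes, since node IDs may be sparse.
	 */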
	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	numa_distance = memblock_alloc(size, PAGE_SIZE);
	if (!numa_distance) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest node known at the time of
 * table creation, or lower than zero, or @distance doesn't make sense,
 * the call is ignored.
 * This is to allow simplification of specific NUMA config implementations.
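 *
 * For example (illustrative values, as a firmware SLIT parser might supply):
 * numa_set_distance(0, 1, 21) records a distance of 21 from node 0 to
 * node 1, while numa_set_distance(0, 0, LOCAL_DISTANCE) is the only value
 * accepted when @from == @to.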
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
			from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

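/*
 * Return the distance from node @from to node @to, falling back to the
 * default LOCAL_DISTANCE/REMOTE_DISTANCE values for nodes outside the
 * currently allocated distance table.
 */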
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

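/*
 * Append the [@start, @end) range for node @nid to @mi, after validating
 * the node ID, the range and the remaining capacity of @mi.
 */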
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/**
 * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the numa_reserved_meminfo.
 *
 * Use case: numa_cleanup_meminfo() reconciles all numa_memblk instances
 * against memblock_type information and moves any that intersect reserved
 * ranges to numa_reserved_meminfo. However, when that information is known
 * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
 * to numa_reserved_meminfo directly.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = memblock_start_of_DRAM();
	const u64 high = memblock_end_of_DRAM();
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and drop any block that has become empty */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
				bi->nid, bi->start, bi->end - 1, bj->start,
				bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as not hotpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all parsed memory blocks and use those ranges to
	 * set the nid in memblock.reserved. This will split up the
	 * memblock regions along node boundaries and will set the node IDs
	 * as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start,
					&memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_meminfo(struct numa_meminfo *mi)
{
	int i;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

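	/* Propagate the parsed node IDs to the memblock.memory regions */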
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early during boot the kernel has to use some memory, e.g.
	 * for loading the kernel image. We cannot prevent this anyway, so
	 * any node the kernel resides in should be un-hotpluggable.
	 *
	 * Also, by the time we get here, allocating the node data cannot
	 * fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M;

			unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M;

			pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
				node_align_mb, sect_align_mb);
			return -EINVAL;
		}
	}

	return 0;
}

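/*
 * Parse NUMA topology via @init_func (e.g. the architecture's ACPI SRAT or
 * devicetree parser), then sanitize the parsed numa_meminfo, apply NUMA
 * emulation if requested, and register the result with memblock.
 */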
int __init numa_memblks_init(int (*init_func)(void),
			     bool memblock_force_top_down)
{
	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
				  NUMA_NO_NODE));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, max_addr));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction here because,
	 * if ACPI_NUMA is configured, we have parsed SRAT in init_func().
	 * It is ok to have the reset here even if we didn't configure
	 * ACPI_NUMA, or if ACPI NUMA init fails and falls back to dummy
	 * NUMA init.
	 */
	if (memblock_force_top_down)
		memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	return numa_register_meminfo(&numa_meminfo);
}

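/* sort() comparator: order numa_memblk pointers by ascending start address */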
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */

int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's
	 * end address and backfilling to it if needed.
	 */
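	/*
	 * For example, blocks [1G, 2G) and [3G, 4G) within the fill range
	 * become [1G, 2G) and [2G, 4G): the second block is pulled back to
	 * the previous block's end so the gap disappears.
	 */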
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#ifdef CONFIG_NUMA_KEEP_MEMINFO
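/*
 * Return the node ID of the numa_meminfo block that contains physical
 * address @start, or NUMA_NO_NODE if no block covers it.
 */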
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

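/*
 * Map a hot-added physical address to a node ID, falling back to the node
 * of the first parsed memblk when the address is not covered by any block.
 */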
int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif /* CONFIG_NUMA_KEEP_MEMINFO */