// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/array_size.h>
#include <linux/sort.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/numa_memblks.h>

static int numa_distance_cnt;
static u8 *numa_distance;

nodemask_t numa_nodes_parsed __initdata;

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
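/*
 * numa_reserved_meminfo (above) collects memory ranges that do not back
 * usable RAM in memblock.memory (e.g. firmware-reserved areas and ranges
 * above max_pfn) so that phys_to_target_node() can still resolve them.
 */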

/*
 * Set, in *@nodemask, the nodes that have memory in @mi.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

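	/*
	 * The distance table is a flat cnt x cnt array indexed directly by
	 * node id, so cnt must be the highest parsed node id plus one, not
	 * just the number of parsed nodes.
	 */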
	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	numa_distance = memblock_alloc(size, PAGE_SIZE);
	if (!numa_distance) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation, or @distance doesn't make sense, the call
 * is ignored. This allows specific NUMA config implementations to stay
 * simple.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
			from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

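	/*
	 * Distances are stored as u8, and a node's distance to itself must
	 * be LOCAL_DISTANCE.
	 */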
	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

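/*
 * Look up the distance between two nodes. Falls back to LOCAL_DISTANCE for a
 * node to itself and REMOTE_DISTANCE otherwise when the distance table was
 * never allocated or the node ids lie outside it.
 */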
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = memblock_start_of_DRAM();
	const u64 high = memblock_end_of_DRAM();
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
				bi->nid, bi->start, bi->end - 1, bj->start,
				bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all parsed memory blocks and use those ranges to
	 * set the nid in memblock.reserved. This will split up the
	 * memblock regions along node boundaries and will set the node IDs
	 * as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start,
					&memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

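/*
 * Push the parsed NUMA layout into memblock (node ids on memblock.memory),
 * sanity check it, and mark nodes containing kernel memory as not
 * hotpluggable.
 */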
static int __init numa_register_meminfo(struct numa_meminfo *mi)
{
	int i;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early on, the kernel has to use some memory, e.g. to load
	 * the kernel image. We cannot prevent this anyway. So any node the
	 * kernel resides in should be un-hotpluggable.
	 *
	 * And by the time we get here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;

			unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;

			pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
				node_align_mb, sect_align_mb);
			return -EINVAL;
		}
	}

	return 0;
}

int __init numa_memblks_init(int (*init_func)(void),
			     bool memblock_force_top_down)
{
	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
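	/* Reset any node assignments left over on memblock regions */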
	WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
				  NUMA_NO_NODE));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, max_addr));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction here because if
	 * we configured ACPI_NUMA, we have parsed SRAT in init_func(). It
	 * is ok to have the reset here even if we didn't configure
	 * ACPI_NUMA or the ACPI NUMA init failed and fell back to dummy
	 * NUMA init.
	 */
	if (memblock_force_top_down)
		memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	return numa_register_meminfo(&numa_meminfo);
}

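/*
 * Sort helper: order numa_memblk pointers by block start address. The
 * two-comparison form avoids the overflow that subtracting u64 start
 * addresses into an int return value could cause.
 */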
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end
 *
 * RETURNS:
 * 0		  : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */

int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblks
	 * end address and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

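		/*
		 * If the previous range overlaps or abuts this one, just
		 * advance prev_end; otherwise pull this block's start back
		 * to prev_end to close the gap.
		 */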
		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#ifdef CONFIG_NUMA_KEEP_MEMINFO
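/* Return the node id of the memblk in @mi covering @start, or NUMA_NO_NODE */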
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added, continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

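/*
 * Map a physical address being hot-added to a node id. If no parsed memblk
 * covers it, fall back to the node of the first parsed memblk.
 */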
int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif /* CONFIG_NUMA_KEEP_MEMINFO */