numa_64.c (fbe99959d1db85222829a64d869dcab704ac7ec8) numa_64.c (b8ef9172b2aad7eeb1fcd37a9e632c7b24da1f64)
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>

--- 4 unchanged lines hidden (view full) ---

13#include <linux/module.h>
14#include <linux/nodemask.h>
15#include <linux/sched.h>
16#include <linux/acpi.h>
17
18#include <asm/e820.h>
19#include <asm/proto.h>
20#include <asm/dma.h>
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>

--- 4 unchanged lines hidden (view full) ---

13#include <linux/module.h>
14#include <linux/nodemask.h>
15#include <linux/sched.h>
16#include <linux/acpi.h>
17
18#include <asm/e820.h>
19#include <asm/proto.h>
20#include <asm/dma.h>
21#include <asm/numa.h>
22#include <asm/acpi.h>
23#include <asm/amd_nb.h>
24
21#include <asm/acpi.h>
22#include <asm/amd_nb.h>
23
/*
 * One contiguous range of physical addresses tagged with a node id.
 * Code in this file computes a block's size as end - start, so @end is
 * used as an exclusive bound.
 */
25struct numa_memblk {
26 u64 start;	/* first address of the range */
27 u64 end;	/* address just past the range */
28 int nid;	/* node id the range is assigned to */
29};
24#include "numa_internal.h"
30
25
/*
 * Table of memory blocks.  Only the first @nr_blks entries of @blk[] are
 * in use; the table can hold at most NR_NODE_MEMBLKS entries.
 */
31struct numa_meminfo {
32 int nr_blks;
33 struct numa_memblk blk[NR_NODE_MEMBLKS];
34};
35
36struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
37EXPORT_SYMBOL(node_data);
38
39nodemask_t numa_nodes_parsed __initdata;
40
41struct memnode memnode;
42
43static unsigned long __initdata nodemap_addr;

--- 166 unchanged lines hidden (view full) ---

210
211 mi->blk[mi->nr_blks].start = start;
212 mi->blk[mi->nr_blks].end = end;
213 mi->blk[mi->nr_blks].nid = nid;
214 mi->nr_blks++;
215 return 0;
216}
217
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29nodemask_t numa_nodes_parsed __initdata;
30
31struct memnode memnode;
32
33static unsigned long __initdata nodemap_addr;

--- 166 unchanged lines hidden (view full) ---

200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
/*
 * Remove entry @idx from @mi: decrement the block count and shift every
 * entry after @idx down by one slot.  memmove() is used because the source
 * and destination ranges overlap.  @idx is not range-checked here.
 */
218static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
208void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
219{
220 mi->nr_blks--;
221 memmove(&mi->blk[idx], &mi->blk[idx + 1],
222 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
223}
224
225int __init numa_add_memblk(int nid, u64 start, u64 end)
226{

--- 41 unchanged lines hidden (view full) ---

268 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
269 NODE_DATA(nodeid)->node_id = nodeid;
270 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
271 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
272
273 node_set_online(nodeid);
274}
275
209{
210 mi->nr_blks--;
211 memmove(&mi->blk[idx], &mi->blk[idx + 1],
212 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
213}
214
215int __init numa_add_memblk(int nid, u64 start, u64 end)
216{

--- 41 unchanged lines hidden (view full) ---

258 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
259 NODE_DATA(nodeid)->node_id = nodeid;
260 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
261 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
262
263 node_set_online(nodeid);
264}
265
276static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
266int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
277{
278 const u64 low = 0;
279 const u64 high = (u64)max_pfn << PAGE_SHIFT;
280 int i, j, k;
281
282 for (i = 0; i < mi->nr_blks; i++) {
283 struct numa_memblk *bi = &mi->blk[i];
284

--- 77 unchanged lines hidden (view full) ---

362 mi->blk[i].nid != NUMA_NO_NODE)
363 node_set(mi->blk[i].nid, *nodemask);
364}
365
366/*
367 * Reset distance table. The current table is freed. The next
368 * numa_set_distance() call will create a new one.
369 */
267{
268 const u64 low = 0;
269 const u64 high = (u64)max_pfn << PAGE_SHIFT;
270 int i, j, k;
271
272 for (i = 0; i < mi->nr_blks; i++) {
273 struct numa_memblk *bi = &mi->blk[i];
274

--- 77 unchanged lines hidden (view full) ---

352 mi->blk[i].nid != NUMA_NO_NODE)
353 node_set(mi->blk[i].nid, *nodemask);
354}
355
356/*
357 * Reset distance table. The current table is freed. The next
358 * numa_set_distance() call will create a new one.
359 */
370static void __init numa_reset_distance(void)
360void __init numa_reset_distance(void)
371{
372 size_t size;
373
374 if (numa_distance_cnt) {
375 size = numa_distance_cnt * sizeof(numa_distance[0]);
376 memblock_x86_free_range(__pa(numa_distance),
377 __pa(numa_distance) + size);
378 numa_distance_cnt = 0;

--- 141 unchanged lines hidden (view full) ---

520
521 if (start < end)
522 setup_node_bootmem(nid, start, end);
523 }
524
525 return 0;
526}
527
361{
362 size_t size;
363
364 if (numa_distance_cnt) {
365 size = numa_distance_cnt * sizeof(numa_distance[0]);
366 memblock_x86_free_range(__pa(numa_distance),
367 __pa(numa_distance) + size);
368 numa_distance_cnt = 0;

--- 141 unchanged lines hidden (view full) ---

510
511 if (start < end)
512 setup_node_bootmem(nid, start, end);
513 }
514
515 return 0;
516}
517
528#ifdef CONFIG_NUMA_EMU
529/* Numa emulation */
/*
 * emu_nid_to_phys[]: for each emulated node id, the physical node id it was
 * carved from (NUMA_NO_NODE while unassigned during setup; identity mapping
 * when emulation is not used).
 * emu_cmdline: the string handed to numa_emu_cmdline(), parsed later by
 * numa_emulation(); NULL when no emulation was requested.
 */
530static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
531static char *emu_cmdline __initdata;
532
533void __init numa_emu_cmdline(char *str)
534{
535 emu_cmdline = str;
536}
537
538static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
539{
540 int i;
541
542 for (i = 0; i < mi->nr_blks; i++)
543 if (mi->blk[i].nid == nid)
544 return i;
545 return -ENOENT;
546}
547
548/*
549 * Sets up nid to range from @start to @end. The return value is -errno if
550 * something went wrong, 0 otherwise.
551 */
552static int __init emu_setup_memblk(struct numa_meminfo *ei,
553 struct numa_meminfo *pi,
554 int nid, int phys_blk, u64 size)
555{
556 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
557 struct numa_memblk *pb = &pi->blk[phys_blk];
558
559 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
560 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
561 return -EINVAL;
562 }
563
564 ei->nr_blks++;
565 eb->start = pb->start;
566 eb->end = pb->start + size;
567 eb->nid = nid;
568
569 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
570 emu_nid_to_phys[nid] = pb->nid;
571
572 pb->start += size;
573 if (pb->start >= pb->end) {
574 WARN_ON_ONCE(pb->start > pb->end);
575 numa_remove_memblk_from(phys_blk, pi);
576 }
577
578 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
579 eb->start, eb->end, (eb->end - eb->start) >> 20);
580 return 0;
581}
582
583/*
584 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
585 * to max_addr. The return value is the number of nodes allocated.
586 */
587static int __init split_nodes_interleave(struct numa_meminfo *ei,
588 struct numa_meminfo *pi,
589 u64 addr, u64 max_addr, int nr_nodes)
590{
591 nodemask_t physnode_mask = NODE_MASK_NONE;
592 u64 size;
593 int big;
594 int nid = 0;
595 int i, ret;
596
597 if (nr_nodes <= 0)
598 return -1;
599 if (nr_nodes > MAX_NUMNODES) {
600 pr_info("numa=fake=%d too large, reducing to %d\n",
601 nr_nodes, MAX_NUMNODES);
602 nr_nodes = MAX_NUMNODES;
603 }
604
605 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
606 /*
607 * Calculate the number of big nodes that can be allocated as a result
608 * of consolidating the remainder.
609 */
610 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
611 FAKE_NODE_MIN_SIZE;
612
613 size &= FAKE_NODE_MIN_HASH_MASK;
614 if (!size) {
615 pr_err("Not enough memory for each node. "
616 "NUMA emulation disabled.\n");
617 return -1;
618 }
619
620 for (i = 0; i < pi->nr_blks; i++)
621 node_set(pi->blk[i].nid, physnode_mask);
622
623 /*
624 * Continue to fill physical nodes with fake nodes until there is no
625 * memory left on any of them.
626 */
627 while (nodes_weight(physnode_mask)) {
628 for_each_node_mask(i, physnode_mask) {
629 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
630 u64 start, limit, end;
631 int phys_blk;
632
633 phys_blk = emu_find_memblk_by_nid(i, pi);
634 if (phys_blk < 0) {
635 node_clear(i, physnode_mask);
636 continue;
637 }
638 start = pi->blk[phys_blk].start;
639 limit = pi->blk[phys_blk].end;
640 end = start + size;
641
642 if (nid < big)
643 end += FAKE_NODE_MIN_SIZE;
644
645 /*
646 * Continue to add memory to this fake node if its
647 * non-reserved memory is less than the per-node size.
648 */
649 while (end - start -
650 memblock_x86_hole_size(start, end) < size) {
651 end += FAKE_NODE_MIN_SIZE;
652 if (end > limit) {
653 end = limit;
654 break;
655 }
656 }
657
658 /*
659 * If there won't be at least FAKE_NODE_MIN_SIZE of
660 * non-reserved memory in ZONE_DMA32 for the next node,
661 * this one must extend to the boundary.
662 */
663 if (end < dma32_end && dma32_end - end -
664 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
665 end = dma32_end;
666
667 /*
668 * If there won't be enough non-reserved memory for the
669 * next node, this one must extend to the end of the
670 * physical node.
671 */
672 if (limit - end -
673 memblock_x86_hole_size(end, limit) < size)
674 end = limit;
675
676 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
677 phys_blk,
678 min(end, limit) - start);
679 if (ret < 0)
680 return ret;
681 }
682 }
683 return 0;
684}
685
686/*
687 * Returns the end address of a node so that there is at least `size' amount of
688 * non-reserved memory or `max_addr' is reached.
689 */
690static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
691{
692 u64 end = start + size;
693
694 while (end - start - memblock_x86_hole_size(start, end) < size) {
695 end += FAKE_NODE_MIN_SIZE;
696 if (end > max_addr) {
697 end = max_addr;
698 break;
699 }
700 }
701 return end;
702}
703
704/*
705 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
706 * `addr' to `max_addr'. The return value is the number of nodes allocated.
707 */
708static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
709 struct numa_meminfo *pi,
710 u64 addr, u64 max_addr, u64 size)
711{
712 nodemask_t physnode_mask = NODE_MASK_NONE;
713 u64 min_size;
714 int nid = 0;
715 int i, ret;
716
717 if (!size)
718 return -1;
719 /*
720 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
721 * increased accordingly if the requested size is too small. This
722 * creates a uniform distribution of node sizes across the entire
723 * machine (but not necessarily over physical nodes).
724 */
725 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
726 MAX_NUMNODES;
727 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
728 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
729 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
730 FAKE_NODE_MIN_HASH_MASK;
731 if (size < min_size) {
732 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
733 size >> 20, min_size >> 20);
734 size = min_size;
735 }
736 size &= FAKE_NODE_MIN_HASH_MASK;
737
738 for (i = 0; i < pi->nr_blks; i++)
739 node_set(pi->blk[i].nid, physnode_mask);
740
741 /*
742 * Fill physical nodes with fake nodes of size until there is no memory
743 * left on any of them.
744 */
745 while (nodes_weight(physnode_mask)) {
746 for_each_node_mask(i, physnode_mask) {
747 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
748 u64 start, limit, end;
749 int phys_blk;
750
751 phys_blk = emu_find_memblk_by_nid(i, pi);
752 if (phys_blk < 0) {
753 node_clear(i, physnode_mask);
754 continue;
755 }
756 start = pi->blk[phys_blk].start;
757 limit = pi->blk[phys_blk].end;
758
759 end = find_end_of_node(start, limit, size);
760 /*
761 * If there won't be at least FAKE_NODE_MIN_SIZE of
762 * non-reserved memory in ZONE_DMA32 for the next node,
763 * this one must extend to the boundary.
764 */
765 if (end < dma32_end && dma32_end - end -
766 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
767 end = dma32_end;
768
769 /*
770 * If there won't be enough non-reserved memory for the
771 * next node, this one must extend to the end of the
772 * physical node.
773 */
774 if (limit - end -
775 memblock_x86_hole_size(end, limit) < size)
776 end = limit;
777
778 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
779 phys_blk,
780 min(end, limit) - start);
781 if (ret < 0)
782 return ret;
783 }
784 }
785 return 0;
786}
787
788/*
789 * Sets up the system RAM area from start_pfn to last_pfn according to the
790 * numa=fake command-line option.
791 */
792static void __init numa_emulation(struct numa_meminfo *numa_meminfo,
793 int numa_dist_cnt)
794{
795 static struct numa_meminfo ei __initdata;
796 static struct numa_meminfo pi __initdata;
797 const u64 max_addr = max_pfn << PAGE_SHIFT;
798 u8 *phys_dist = NULL;
799 int i, j, ret;
800
801 if (!emu_cmdline)
802 goto no_emu;
803
804 memset(&ei, 0, sizeof(ei));
805 pi = *numa_meminfo;
806
807 for (i = 0; i < MAX_NUMNODES; i++)
808 emu_nid_to_phys[i] = NUMA_NO_NODE;
809
810 /*
811 * If the numa=fake command-line contains a 'M' or 'G', it represents
812 * the fixed node size. Otherwise, if it is just a single number N,
813 * split the system RAM into N fake nodes.
814 */
815 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
816 u64 size;
817
818 size = memparse(emu_cmdline, &emu_cmdline);
819 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
820 } else {
821 unsigned long n;
822
823 n = simple_strtoul(emu_cmdline, NULL, 0);
824 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
825 }
826
827 if (ret < 0)
828 goto no_emu;
829
830 if (numa_cleanup_meminfo(&ei) < 0) {
831 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
832 goto no_emu;
833 }
834
835 /*
836 * Copy the original distance table. It's temporary so no need to
837 * reserve it.
838 */
839 if (numa_dist_cnt) {
840 size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
841 u64 phys;
842
843 phys = memblock_find_in_range(0,
844 (u64)max_pfn_mapped << PAGE_SHIFT,
845 size, PAGE_SIZE);
846 if (phys == MEMBLOCK_ERROR) {
847 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
848 goto no_emu;
849 }
850 phys_dist = __va(phys);
851
852 for (i = 0; i < numa_dist_cnt; i++)
853 for (j = 0; j < numa_dist_cnt; j++)
854 phys_dist[i * numa_dist_cnt + j] =
855 node_distance(i, j);
856 }
857
858 /* commit */
859 *numa_meminfo = ei;
860
861 /*
862 * Transform __apicid_to_node table to use emulated nids by
863 * reverse-mapping phys_nid. The maps should always exist but fall
864 * back to zero just in case.
865 */
866 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
867 if (__apicid_to_node[i] == NUMA_NO_NODE)
868 continue;
869 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
870 if (__apicid_to_node[i] == emu_nid_to_phys[j])
871 break;
872 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
873 }
874
875 /* make sure all emulated nodes are mapped to a physical node */
876 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
877 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
878 emu_nid_to_phys[i] = 0;
879
880 /* transform distance table */
881 numa_reset_distance();
882 for (i = 0; i < MAX_NUMNODES; i++) {
883 for (j = 0; j < MAX_NUMNODES; j++) {
884 int physi = emu_nid_to_phys[i];
885 int physj = emu_nid_to_phys[j];
886 int dist;
887
888 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
889 dist = physi == physj ?
890 LOCAL_DISTANCE : REMOTE_DISTANCE;
891 else
892 dist = phys_dist[physi * numa_dist_cnt + physj];
893
894 numa_set_distance(i, j, dist);
895 }
896 }
897 return;
898
899no_emu:
900 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
901 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
902 emu_nid_to_phys[i] = i;
903}
904#else /* CONFIG_NUMA_EMU */
/* NUMA emulation is compiled out: leave @numa_meminfo as it is. */
static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
				  int numa_dist_cnt)
{
}
908#endif /* CONFIG_NUMA_EMU */
909
910static int __init dummy_numa_init(void)
911{
912 printk(KERN_INFO "%s\n",
913 numa_off ? "NUMA turned off" : "No NUMA configuration found");
914 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
915 0LU, max_pfn << PAGE_SHIFT);
916
917 node_set(0, numa_nodes_parsed);

--- 71 unchanged lines hidden (view full) ---

989int __cpuinit numa_cpu_node(int cpu)
990{
991 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
992
993 if (apicid != BAD_APICID)
994 return __apicid_to_node[apicid];
995 return NUMA_NO_NODE;
996}
518static int __init dummy_numa_init(void)
519{
520 printk(KERN_INFO "%s\n",
521 numa_off ? "NUMA turned off" : "No NUMA configuration found");
522 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
523 0LU, max_pfn << PAGE_SHIFT);
524
525 node_set(0, numa_nodes_parsed);

--- 71 unchanged lines hidden (view full) ---

597int __cpuinit numa_cpu_node(int cpu)
598{
599 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
600
601 if (apicid != BAD_APICID)
602 return __apicid_to_node[apicid];
603 return NUMA_NO_NODE;
604}
997
998/*
999 * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use
1000 * of 64bit specific data structures. The distinction is artificial and
1001 * should be removed. numa_{add|remove}_cpu() are implemented in numa.c
1002 * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when
1003 * enabled.
1004 *
1005 * NUMA emulation is planned to be made generic and the following and other
1006 * related code should be moved to numa.c.
1007 */
1008#ifdef CONFIG_NUMA_EMU
1009# ifndef CONFIG_DEBUG_PER_CPU_MAPS
1010void __cpuinit numa_add_cpu(int cpu)
1011{
1012 int physnid, nid;
1013
1014 nid = numa_cpu_node(cpu);
1015 if (nid == NUMA_NO_NODE)
1016 nid = early_cpu_to_node(cpu);
1017 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
1018
1019 physnid = emu_nid_to_phys[nid];
1020
1021 /*
1022 * Map the cpu to each emulated node that is allocated on the physical
1023 * node of the cpu's apic id.
1024 */
1025 for_each_online_node(nid)
1026 if (emu_nid_to_phys[nid] == physnid)
1027 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
1028}
1029
1030void __cpuinit numa_remove_cpu(int cpu)
1031{
1032 int i;
1033
1034 for_each_online_node(i)
1035 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
1036}
1037# else /* !CONFIG_DEBUG_PER_CPU_MAPS */
1038static void __cpuinit numa_set_cpumask(int cpu, int enable)
1039{
1040 struct cpumask *mask;
1041 int nid, physnid, i;
1042
1043 nid = early_cpu_to_node(cpu);
1044 if (nid == NUMA_NO_NODE) {
1045 /* early_cpu_to_node() already emits a warning and trace */
1046 return;
1047 }
1048
1049 physnid = emu_nid_to_phys[nid];
1050
1051 for_each_online_node(i) {
1052 if (emu_nid_to_phys[nid] != physnid)
1053 continue;
1054
1055 mask = debug_cpumask_set_cpu(cpu, enable);
1056 if (!mask)
1057 return;
1058
1059 if (enable)
1060 cpumask_set_cpu(cpu, mask);
1061 else
1062 cpumask_clear_cpu(cpu, mask);
1063 }
1064}
1065
1066void __cpuinit numa_add_cpu(int cpu)
1067{
1068 numa_set_cpumask(cpu, 1);
1069}
1070
1071void __cpuinit numa_remove_cpu(int cpu)
1072{
1073 numa_set_cpumask(cpu, 0);
1074}
1075# endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
1076#endif /* CONFIG_NUMA_EMU */