numa_64.c: fbe99959d1db85222829a64d869dcab704ac7ec8 -> b8ef9172b2aad7eeb1fcd37a9e632c7b24da1f64

--- a/numa_64.c
+++ b/numa_64.c
 /*
  * Generic VM initialization for x86-64 NUMA setups.
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/init.h>
--- 4 unchanged lines hidden ---
 #include <linux/module.h>
 #include <linux/nodemask.h>
 #include <linux/sched.h>
 #include <linux/acpi.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/dma.h>
-#include <asm/numa.h>
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
 
-struct numa_memblk {
-	u64 start;
-	u64 end;
-	int nid;
-};
+#include "numa_internal.h"
 
-struct numa_meminfo {
-	int nr_blks;
-	struct numa_memblk blk[NR_NODE_MEMBLKS];
-};
-
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
 nodemask_t numa_nodes_parsed __initdata;
 
 struct memnode memnode;
 
 static unsigned long __initdata nodemap_addr;
--- 166 unchanged lines hidden ---
 
 	mi->blk[mi->nr_blks].start = start;
 	mi->blk[mi->nr_blks].end = end;
 	mi->blk[mi->nr_blks].nid = nid;
 	mi->nr_blks++;
 	return 0;
 }
 
-static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
 {
 	mi->nr_blks--;
 	memmove(&mi->blk[idx], &mi->blk[idx + 1],
 		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
 }
 
 int __init numa_add_memblk(int nid, u64 start, u64 end)
 {
--- 41 unchanged lines hidden ---
 	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
 	NODE_DATA(nodeid)->node_id = nodeid;
 	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
 
 	node_set_online(nodeid);
 }
 
-static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 {
 	const u64 low = 0;
 	const u64 high = (u64)max_pfn << PAGE_SHIFT;
 	int i, j, k;
 
 	for (i = 0; i < mi->nr_blks; i++) {
 		struct numa_memblk *bi = &mi->blk[i];
 
--- 77 unchanged lines hidden ---
 		    mi->blk[i].nid != NUMA_NO_NODE)
 			node_set(mi->blk[i].nid, *nodemask);
 }
 
 /*
  * Reset distance table. The current table is freed. The next
  * numa_set_distance() call will create a new one.
  */
-static void __init numa_reset_distance(void)
+void __init numa_reset_distance(void)
 {
 	size_t size;
 
 	if (numa_distance_cnt) {
 		size = numa_distance_cnt * sizeof(numa_distance[0]);
 		memblock_x86_free_range(__pa(numa_distance),
 					__pa(numa_distance) + size);
 		numa_distance_cnt = 0;
--- 141 unchanged lines hidden ---
 
 		if (start < end)
 			setup_node_bootmem(nid, start, end);
 	}
 
 	return 0;
 }
 
-#ifdef CONFIG_NUMA_EMU
-/* Numa emulation */
-static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
-static char *emu_cmdline __initdata;
-
-void __init numa_emu_cmdline(char *str)
-{
-	emu_cmdline = str;
-}
-
-static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
-{
-	int i;
-
-	for (i = 0; i < mi->nr_blks; i++)
-		if (mi->blk[i].nid == nid)
-			return i;
-	return -ENOENT;
-}
-
-/*
- * Sets up nid to range from @start to @end. The return value is -errno if
- * something went wrong, 0 otherwise.
- */
-static int __init emu_setup_memblk(struct numa_meminfo *ei,
-				   struct numa_meminfo *pi,
-				   int nid, int phys_blk, u64 size)
-{
-	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
-	struct numa_memblk *pb = &pi->blk[phys_blk];
-
-	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
-		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
-		return -EINVAL;
-	}
-
-	ei->nr_blks++;
-	eb->start = pb->start;
-	eb->end = pb->start + size;
-	eb->nid = nid;
-
-	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
-		emu_nid_to_phys[nid] = pb->nid;
-
-	pb->start += size;
-	if (pb->start >= pb->end) {
-		WARN_ON_ONCE(pb->start > pb->end);
-		numa_remove_memblk_from(phys_blk, pi);
-	}
-
-	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-	       eb->start, eb->end, (eb->end - eb->start) >> 20);
-	return 0;
-}
-
-/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr. The return value is the number of nodes allocated.
- */
-static int __init split_nodes_interleave(struct numa_meminfo *ei,
-					 struct numa_meminfo *pi,
-					 u64 addr, u64 max_addr, int nr_nodes)
-{
-	nodemask_t physnode_mask = NODE_MASK_NONE;
-	u64 size;
-	int big;
-	int nid = 0;
-	int i, ret;
-
-	if (nr_nodes <= 0)
-		return -1;
-	if (nr_nodes > MAX_NUMNODES) {
-		pr_info("numa=fake=%d too large, reducing to %d\n",
-			nr_nodes, MAX_NUMNODES);
-		nr_nodes = MAX_NUMNODES;
-	}
-
-	size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
-	/*
-	 * Calculate the number of big nodes that can be allocated as a result
-	 * of consolidating the remainder.
-	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
-		FAKE_NODE_MIN_SIZE;
-
-	size &= FAKE_NODE_MIN_HASH_MASK;
-	if (!size) {
-		pr_err("Not enough memory for each node. "
-			"NUMA emulation disabled.\n");
-		return -1;
-	}
-
-	for (i = 0; i < pi->nr_blks; i++)
-		node_set(pi->blk[i].nid, physnode_mask);
-
-	/*
-	 * Continue to fill physical nodes with fake nodes until there is no
-	 * memory left on any of them.
-	 */
-	while (nodes_weight(physnode_mask)) {
-		for_each_node_mask(i, physnode_mask) {
-			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
-			u64 start, limit, end;
-			int phys_blk;
-
-			phys_blk = emu_find_memblk_by_nid(i, pi);
-			if (phys_blk < 0) {
-				node_clear(i, physnode_mask);
-				continue;
-			}
-			start = pi->blk[phys_blk].start;
-			limit = pi->blk[phys_blk].end;
-			end = start + size;
-
-			if (nid < big)
-				end += FAKE_NODE_MIN_SIZE;
-
-			/*
-			 * Continue to add memory to this fake node if its
-			 * non-reserved memory is less than the per-node size.
-			 */
-			while (end - start -
-			       memblock_x86_hole_size(start, end) < size) {
-				end += FAKE_NODE_MIN_SIZE;
-				if (end > limit) {
-					end = limit;
-					break;
-				}
-			}
-
-			/*
-			 * If there won't be at least FAKE_NODE_MIN_SIZE of
-			 * non-reserved memory in ZONE_DMA32 for the next node,
-			 * this one must extend to the boundary.
-			 */
-			if (end < dma32_end && dma32_end - end -
-			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-				end = dma32_end;
-
-			/*
-			 * If there won't be enough non-reserved memory for the
-			 * next node, this one must extend to the end of the
-			 * physical node.
-			 */
-			if (limit - end -
-			    memblock_x86_hole_size(end, limit) < size)
-				end = limit;
-
-			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
-					       phys_blk,
-					       min(end, limit) - start);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/*
- * Returns the end address of a node so that there is at least `size' amount of
- * non-reserved memory or `max_addr' is reached.
- */
-static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
-{
-	u64 end = start + size;
-
-	while (end - start - memblock_x86_hole_size(start, end) < size) {
-		end += FAKE_NODE_MIN_SIZE;
-		if (end > max_addr) {
-			end = max_addr;
-			break;
-		}
-	}
-	return end;
-}
-
-/*
- * Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'. The return value is the number of nodes allocated.
- */
-static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
-					      struct numa_meminfo *pi,
-					      u64 addr, u64 max_addr, u64 size)
-{
-	nodemask_t physnode_mask = NODE_MASK_NONE;
-	u64 min_size;
-	int nid = 0;
-	int i, ret;
-
-	if (!size)
-		return -1;
-	/*
-	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
-	 * increased accordingly if the requested size is too small. This
-	 * creates a uniform distribution of node sizes across the entire
-	 * machine (but not necessarily over physical nodes).
-	 */
-	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
-		MAX_NUMNODES;
-	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
-	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
-		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
-			FAKE_NODE_MIN_HASH_MASK;
-	if (size < min_size) {
-		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
-			size >> 20, min_size >> 20);
-		size = min_size;
-	}
-	size &= FAKE_NODE_MIN_HASH_MASK;
-
-	for (i = 0; i < pi->nr_blks; i++)
-		node_set(pi->blk[i].nid, physnode_mask);
-
-	/*
-	 * Fill physical nodes with fake nodes of size until there is no memory
-	 * left on any of them.
-	 */
-	while (nodes_weight(physnode_mask)) {
-		for_each_node_mask(i, physnode_mask) {
-			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
-			u64 start, limit, end;
-			int phys_blk;
-
-			phys_blk = emu_find_memblk_by_nid(i, pi);
-			if (phys_blk < 0) {
-				node_clear(i, physnode_mask);
-				continue;
-			}
-			start = pi->blk[phys_blk].start;
-			limit = pi->blk[phys_blk].end;
-
-			end = find_end_of_node(start, limit, size);
-			/*
-			 * If there won't be at least FAKE_NODE_MIN_SIZE of
-			 * non-reserved memory in ZONE_DMA32 for the next node,
-			 * this one must extend to the boundary.
-			 */
-			if (end < dma32_end && dma32_end - end -
-			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-				end = dma32_end;
-
-			/*
-			 * If there won't be enough non-reserved memory for the
-			 * next node, this one must extend to the end of the
-			 * physical node.
-			 */
-			if (limit - end -
-			    memblock_x86_hole_size(end, limit) < size)
-				end = limit;
-
-			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
-					       phys_blk,
-					       min(end, limit) - start);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/*
- * Sets up the system RAM area from start_pfn to last_pfn according to the
- * numa=fake command-line option.
- */
-static void __init numa_emulation(struct numa_meminfo *numa_meminfo,
-				  int numa_dist_cnt)
-{
-	static struct numa_meminfo ei __initdata;
-	static struct numa_meminfo pi __initdata;
-	const u64 max_addr = max_pfn << PAGE_SHIFT;
-	u8 *phys_dist = NULL;
-	int i, j, ret;
-
-	if (!emu_cmdline)
-		goto no_emu;
-
-	memset(&ei, 0, sizeof(ei));
-	pi = *numa_meminfo;
-
-	for (i = 0; i < MAX_NUMNODES; i++)
-		emu_nid_to_phys[i] = NUMA_NO_NODE;
-
-	/*
-	 * If the numa=fake command-line contains a 'M' or 'G', it represents
-	 * the fixed node size. Otherwise, if it is just a single number N,
-	 * split the system RAM into N fake nodes.
-	 */
-	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
-		u64 size;
-
-		size = memparse(emu_cmdline, &emu_cmdline);
-		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
-	} else {
-		unsigned long n;
-
-		n = simple_strtoul(emu_cmdline, NULL, 0);
-		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
-	}
-
-	if (ret < 0)
-		goto no_emu;
-
-	if (numa_cleanup_meminfo(&ei) < 0) {
-		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
-		goto no_emu;
-	}
-
-	/*
-	 * Copy the original distance table. It's temporary so no need to
-	 * reserve it.
-	 */
-	if (numa_dist_cnt) {
-		size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
-		u64 phys;
-
-		phys = memblock_find_in_range(0,
-					      (u64)max_pfn_mapped << PAGE_SHIFT,
-					      size, PAGE_SIZE);
-		if (phys == MEMBLOCK_ERROR) {
-			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
-			goto no_emu;
-		}
-		phys_dist = __va(phys);
-
-		for (i = 0; i < numa_dist_cnt; i++)
-			for (j = 0; j < numa_dist_cnt; j++)
-				phys_dist[i * numa_dist_cnt + j] =
-					node_distance(i, j);
-	}
-
-	/* commit */
-	*numa_meminfo = ei;
-
-	/*
-	 * Transform __apicid_to_node table to use emulated nids by
-	 * reverse-mapping phys_nid. The maps should always exist but fall
-	 * back to zero just in case.
-	 */
-	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
-		if (__apicid_to_node[i] == NUMA_NO_NODE)
-			continue;
-		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
-			if (__apicid_to_node[i] == emu_nid_to_phys[j])
-				break;
-		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
-	}
-
-	/* make sure all emulated nodes are mapped to a physical node */
-	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
-		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
-			emu_nid_to_phys[i] = 0;
-
-	/* transform distance table */
-	numa_reset_distance();
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		for (j = 0; j < MAX_NUMNODES; j++) {
-			int physi = emu_nid_to_phys[i];
-			int physj = emu_nid_to_phys[j];
-			int dist;
-
-			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
-				dist = physi == physj ?
-					LOCAL_DISTANCE : REMOTE_DISTANCE;
-			else
-				dist = phys_dist[physi * numa_dist_cnt + physj];
-
-			numa_set_distance(i, j, dist);
-		}
-	}
-	return;
-
-no_emu:
-	/* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
-	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
-		emu_nid_to_phys[i] = i;
-}
-#else	/* CONFIG_NUMA_EMU */
-static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
-				  int numa_dist_cnt)
-{ }
-#endif	/* CONFIG_NUMA_EMU */
-
 static int __init dummy_numa_init(void)
 {
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
 	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 	       0LU, max_pfn << PAGE_SHIFT);
 
 	node_set(0, numa_nodes_parsed);
--- 71 unchanged lines hidden ---
 int __cpuinit numa_cpu_node(int cpu)
 {
 	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
 
 	if (apicid != BAD_APICID)
 		return __apicid_to_node[apicid];
 	return NUMA_NO_NODE;
 }
-
-/*
- * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use
- * of 64bit specific data structures. The distinction is artificial and
- * should be removed. numa_{add|remove}_cpu() are implemented in numa.c
- * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when
- * enabled.
- *
- * NUMA emulation is planned to be made generic and the following and other
- * related code should be moved to numa.c.
- */
-#ifdef CONFIG_NUMA_EMU
-# ifndef CONFIG_DEBUG_PER_CPU_MAPS
-void __cpuinit numa_add_cpu(int cpu)
-{
-	int physnid, nid;
-
-	nid = numa_cpu_node(cpu);
-	if (nid == NUMA_NO_NODE)
-		nid = early_cpu_to_node(cpu);
-	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
-
-	physnid = emu_nid_to_phys[nid];
-
-	/*
-	 * Map the cpu to each emulated node that is allocated on the physical
-	 * node of the cpu's apic id.
-	 */
-	for_each_online_node(nid)
-		if (emu_nid_to_phys[nid] == physnid)
-			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	int i;
-
-	for_each_online_node(i)
-		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
-}
-# else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
-	struct cpumask *mask;
-	int nid, physnid, i;
-
-	nid = early_cpu_to_node(cpu);
-	if (nid == NUMA_NO_NODE) {
-		/* early_cpu_to_node() already emits a warning and trace */
-		return;
-	}
-
-	physnid = emu_nid_to_phys[nid];
-
-	for_each_online_node(i) {
-		if (emu_nid_to_phys[nid] != physnid)
-			continue;
-
-		mask = debug_cpumask_set_cpu(cpu, enable);
-		if (!mask)
-			return;
-
-		if (enable)
-			cpumask_set_cpu(cpu, mask);
-		else
-			cpumask_clear_cpu(cpu, mask);
-	}
-}
-
-void __cpuinit numa_add_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 1);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 0);
-}
-# endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
-#endif	/* CONFIG_NUMA_EMU */
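
Note: the numa_internal.h header that the new version includes is not part of this diff. The following is a sketch only, inferred from the changes above rather than quoted from the tree: it collects the two structures dropped from numa_64.c, prototypes for the three helpers that lose their static qualifier, and a declaration plus !CONFIG_NUMA_EMU stub for numa_emulation(), whose definition leaves this file. The guard name and includes are assumptions.

/* numa_internal.h -- hypothetical reconstruction inferred from this diff */
#ifndef __X86_MM_NUMA_INTERNAL_H
#define __X86_MM_NUMA_INTERNAL_H

#include <linux/types.h>
#include <linux/init.h>
#include <asm/numa.h>	/* assumption: would explain the include removed above */

/* formerly file-local to numa_64.c, now shared with the emulation code */
struct numa_memblk {
	u64 start;
	u64 end;
	int nid;
};

struct numa_meminfo {
	int nr_blks;
	struct numa_memblk blk[NR_NODE_MEMBLKS];
};

/* un-static'd in this diff so the relocated emulation code can call them */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
void __init numa_reset_distance(void);

#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
			   int numa_dist_cnt);
#else
static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
				  int numa_dist_cnt)
{ }
#endif

#endif	/* __X86_MM_NUMA_INTERNAL_H */

If asm/numa.h is indeed pulled in through this header, that also accounts for numa_64.c dropping its own #include <asm/numa.h> at the top of the diff.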