xref: /linux/arch/powerpc/mm/numa.c (revision ab1f9dac6eea25ee59e4c8e1cf0b7476afbbfe07)
1*ab1f9dacSPaul Mackerras /*
2*ab1f9dacSPaul Mackerras  * pSeries NUMA support
3*ab1f9dacSPaul Mackerras  *
4*ab1f9dacSPaul Mackerras  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5*ab1f9dacSPaul Mackerras  *
6*ab1f9dacSPaul Mackerras  * This program is free software; you can redistribute it and/or
7*ab1f9dacSPaul Mackerras  * modify it under the terms of the GNU General Public License
8*ab1f9dacSPaul Mackerras  * as published by the Free Software Foundation; either version
9*ab1f9dacSPaul Mackerras  * 2 of the License, or (at your option) any later version.
10*ab1f9dacSPaul Mackerras  */
11*ab1f9dacSPaul Mackerras #include <linux/threads.h>
12*ab1f9dacSPaul Mackerras #include <linux/bootmem.h>
13*ab1f9dacSPaul Mackerras #include <linux/init.h>
14*ab1f9dacSPaul Mackerras #include <linux/mm.h>
15*ab1f9dacSPaul Mackerras #include <linux/mmzone.h>
16*ab1f9dacSPaul Mackerras #include <linux/module.h>
17*ab1f9dacSPaul Mackerras #include <linux/nodemask.h>
18*ab1f9dacSPaul Mackerras #include <linux/cpu.h>
19*ab1f9dacSPaul Mackerras #include <linux/notifier.h>
20*ab1f9dacSPaul Mackerras #include <asm/lmb.h>
21*ab1f9dacSPaul Mackerras #include <asm/machdep.h>
22*ab1f9dacSPaul Mackerras #include <asm/abs_addr.h>
23*ab1f9dacSPaul Mackerras 
/* Non-zero unless NUMA is disabled; nothing in this file clears it --
 * presumably a "numa=off" style boot option does, TODO confirm. */
static int numa_enabled = 1;

/* Debug chatter is compiled in but silent unless numa_debug is set. */
static int numa_debug;
/* NOTE(review): bare-if macro body -- a following `else` would bind to
 * this hidden `if`; a do { } while (0) wrapper would be safer. */
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

/*
 * Lookup-table entries start out as -1 ("unassigned") in DEBUG_NUMA
 * builds so stale entries stand out; otherwise they default to node 0.
 */
#ifdef DEBUG_NUMA
#define ARRAY_INITIALISER -1
#else
#define ARRAY_INITIALISER 0
#endif

/* logical cpu number -> NUMA node id */
int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
	ARRAY_INITIALISER};
/* one byte per MEMORY_INCREMENT of physical RAM -> owning node id;
 * indexed by (physical address >> MEMORY_INCREMENT_SHIFT) */
char *numa_memory_lookup_table;
/* node id -> mask of cpus mapped onto that node */
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
/* node id -> how many cpus are currently set in the mask above */
int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};

struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
/* Index into "ibm,associativity" where node ids live; -1 means no NUMA. */
static int min_common_depth;

/*
 * We need somewhere to store start/span for each node until we have
 * allocated the real node_data structures.
 */
static struct {
	unsigned long node_start_pfn;
	unsigned long node_end_pfn;
	unsigned long node_present_pages;
} init_node_data[MAX_NUMNODES] __initdata;

EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_memory_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(nr_cpus_in_node);
60*ab1f9dacSPaul Mackerras 
61*ab1f9dacSPaul Mackerras static inline void map_cpu_to_node(int cpu, int node)
62*ab1f9dacSPaul Mackerras {
63*ab1f9dacSPaul Mackerras 	numa_cpu_lookup_table[cpu] = node;
64*ab1f9dacSPaul Mackerras 	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
65*ab1f9dacSPaul Mackerras 		cpu_set(cpu, numa_cpumask_lookup_table[node]);
66*ab1f9dacSPaul Mackerras 		nr_cpus_in_node[node]++;
67*ab1f9dacSPaul Mackerras 	}
68*ab1f9dacSPaul Mackerras }
69*ab1f9dacSPaul Mackerras 
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Undo map_cpu_to_node() for a departing cpu: clear it from its node's
 * cpumask and drop the node's cpu count.  Complains if the cpu was not
 * actually present in the mask recorded for it.
 */
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];
	cpumask_t *mask = &numa_cpumask_lookup_table[node];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (!cpu_isset(cpu, *mask)) {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
		return;
	}

	cpu_clear(cpu, *mask);
	nr_cpus_in_node[node]--;
}
#endif /* CONFIG_HOTPLUG_CPU */
86*ab1f9dacSPaul Mackerras 
87*ab1f9dacSPaul Mackerras static struct device_node * __devinit find_cpu_node(unsigned int cpu)
88*ab1f9dacSPaul Mackerras {
89*ab1f9dacSPaul Mackerras 	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
90*ab1f9dacSPaul Mackerras 	struct device_node *cpu_node = NULL;
91*ab1f9dacSPaul Mackerras 	unsigned int *interrupt_server, *reg;
92*ab1f9dacSPaul Mackerras 	int len;
93*ab1f9dacSPaul Mackerras 
94*ab1f9dacSPaul Mackerras 	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
95*ab1f9dacSPaul Mackerras 		/* Try interrupt server first */
96*ab1f9dacSPaul Mackerras 		interrupt_server = (unsigned int *)get_property(cpu_node,
97*ab1f9dacSPaul Mackerras 					"ibm,ppc-interrupt-server#s", &len);
98*ab1f9dacSPaul Mackerras 
99*ab1f9dacSPaul Mackerras 		len = len / sizeof(u32);
100*ab1f9dacSPaul Mackerras 
101*ab1f9dacSPaul Mackerras 		if (interrupt_server && (len > 0)) {
102*ab1f9dacSPaul Mackerras 			while (len--) {
103*ab1f9dacSPaul Mackerras 				if (interrupt_server[len] == hw_cpuid)
104*ab1f9dacSPaul Mackerras 					return cpu_node;
105*ab1f9dacSPaul Mackerras 			}
106*ab1f9dacSPaul Mackerras 		} else {
107*ab1f9dacSPaul Mackerras 			reg = (unsigned int *)get_property(cpu_node,
108*ab1f9dacSPaul Mackerras 							   "reg", &len);
109*ab1f9dacSPaul Mackerras 			if (reg && (len > 0) && (reg[0] == hw_cpuid))
110*ab1f9dacSPaul Mackerras 				return cpu_node;
111*ab1f9dacSPaul Mackerras 		}
112*ab1f9dacSPaul Mackerras 	}
113*ab1f9dacSPaul Mackerras 
114*ab1f9dacSPaul Mackerras 	return NULL;
115*ab1f9dacSPaul Mackerras }
116*ab1f9dacSPaul Mackerras 
117*ab1f9dacSPaul Mackerras /* must hold reference to node during call */
118*ab1f9dacSPaul Mackerras static int *of_get_associativity(struct device_node *dev)
119*ab1f9dacSPaul Mackerras {
120*ab1f9dacSPaul Mackerras 	return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
121*ab1f9dacSPaul Mackerras }
122*ab1f9dacSPaul Mackerras 
123*ab1f9dacSPaul Mackerras static int of_node_numa_domain(struct device_node *device)
124*ab1f9dacSPaul Mackerras {
125*ab1f9dacSPaul Mackerras 	int numa_domain;
126*ab1f9dacSPaul Mackerras 	unsigned int *tmp;
127*ab1f9dacSPaul Mackerras 
128*ab1f9dacSPaul Mackerras 	if (min_common_depth == -1)
129*ab1f9dacSPaul Mackerras 		return 0;
130*ab1f9dacSPaul Mackerras 
131*ab1f9dacSPaul Mackerras 	tmp = of_get_associativity(device);
132*ab1f9dacSPaul Mackerras 	if (tmp && (tmp[0] >= min_common_depth)) {
133*ab1f9dacSPaul Mackerras 		numa_domain = tmp[min_common_depth];
134*ab1f9dacSPaul Mackerras 	} else {
135*ab1f9dacSPaul Mackerras 		dbg("WARNING: no NUMA information for %s\n",
136*ab1f9dacSPaul Mackerras 		    device->full_name);
137*ab1f9dacSPaul Mackerras 		numa_domain = 0;
138*ab1f9dacSPaul Mackerras 	}
139*ab1f9dacSPaul Mackerras 	return numa_domain;
140*ab1f9dacSPaul Mackerras }
141*ab1f9dacSPaul Mackerras 
142*ab1f9dacSPaul Mackerras /*
143*ab1f9dacSPaul Mackerras  * In theory, the "ibm,associativity" property may contain multiple
144*ab1f9dacSPaul Mackerras  * associativity lists because a resource may be multiply connected
145*ab1f9dacSPaul Mackerras  * into the machine.  This resource then has different associativity
146*ab1f9dacSPaul Mackerras  * characteristics relative to its multiple connections.  We ignore
147*ab1f9dacSPaul Mackerras  * this for now.  We also assume that all cpu and memory sets have
148*ab1f9dacSPaul Mackerras  * their distances represented at a common level.  This won't be
149*ab1f9dacSPaul Mackerras  * true for heirarchical NUMA.
150*ab1f9dacSPaul Mackerras  *
151*ab1f9dacSPaul Mackerras  * In any case the ibm,associativity-reference-points should give
152*ab1f9dacSPaul Mackerras  * the correct depth for a normal NUMA system.
153*ab1f9dacSPaul Mackerras  *
154*ab1f9dacSPaul Mackerras  * - Dave Hansen <haveblue@us.ibm.com>
155*ab1f9dacSPaul Mackerras  */
156*ab1f9dacSPaul Mackerras static int __init find_min_common_depth(void)
157*ab1f9dacSPaul Mackerras {
158*ab1f9dacSPaul Mackerras 	int depth;
159*ab1f9dacSPaul Mackerras 	unsigned int *ref_points;
160*ab1f9dacSPaul Mackerras 	struct device_node *rtas_root;
161*ab1f9dacSPaul Mackerras 	unsigned int len;
162*ab1f9dacSPaul Mackerras 
163*ab1f9dacSPaul Mackerras 	rtas_root = of_find_node_by_path("/rtas");
164*ab1f9dacSPaul Mackerras 
165*ab1f9dacSPaul Mackerras 	if (!rtas_root)
166*ab1f9dacSPaul Mackerras 		return -1;
167*ab1f9dacSPaul Mackerras 
168*ab1f9dacSPaul Mackerras 	/*
169*ab1f9dacSPaul Mackerras 	 * this property is 2 32-bit integers, each representing a level of
170*ab1f9dacSPaul Mackerras 	 * depth in the associativity nodes.  The first is for an SMP
171*ab1f9dacSPaul Mackerras 	 * configuration (should be all 0's) and the second is for a normal
172*ab1f9dacSPaul Mackerras 	 * NUMA configuration.
173*ab1f9dacSPaul Mackerras 	 */
174*ab1f9dacSPaul Mackerras 	ref_points = (unsigned int *)get_property(rtas_root,
175*ab1f9dacSPaul Mackerras 			"ibm,associativity-reference-points", &len);
176*ab1f9dacSPaul Mackerras 
177*ab1f9dacSPaul Mackerras 	if ((len >= 1) && ref_points) {
178*ab1f9dacSPaul Mackerras 		depth = ref_points[1];
179*ab1f9dacSPaul Mackerras 	} else {
180*ab1f9dacSPaul Mackerras 		dbg("WARNING: could not find NUMA "
181*ab1f9dacSPaul Mackerras 		    "associativity reference point\n");
182*ab1f9dacSPaul Mackerras 		depth = -1;
183*ab1f9dacSPaul Mackerras 	}
184*ab1f9dacSPaul Mackerras 	of_node_put(rtas_root);
185*ab1f9dacSPaul Mackerras 
186*ab1f9dacSPaul Mackerras 	return depth;
187*ab1f9dacSPaul Mackerras }
188*ab1f9dacSPaul Mackerras 
189*ab1f9dacSPaul Mackerras static int __init get_mem_addr_cells(void)
190*ab1f9dacSPaul Mackerras {
191*ab1f9dacSPaul Mackerras 	struct device_node *memory = NULL;
192*ab1f9dacSPaul Mackerras 	int rc;
193*ab1f9dacSPaul Mackerras 
194*ab1f9dacSPaul Mackerras 	memory = of_find_node_by_type(memory, "memory");
195*ab1f9dacSPaul Mackerras 	if (!memory)
196*ab1f9dacSPaul Mackerras 		return 0; /* it won't matter */
197*ab1f9dacSPaul Mackerras 
198*ab1f9dacSPaul Mackerras 	rc = prom_n_addr_cells(memory);
199*ab1f9dacSPaul Mackerras 	return rc;
200*ab1f9dacSPaul Mackerras }
201*ab1f9dacSPaul Mackerras 
202*ab1f9dacSPaul Mackerras static int __init get_mem_size_cells(void)
203*ab1f9dacSPaul Mackerras {
204*ab1f9dacSPaul Mackerras 	struct device_node *memory = NULL;
205*ab1f9dacSPaul Mackerras 	int rc;
206*ab1f9dacSPaul Mackerras 
207*ab1f9dacSPaul Mackerras 	memory = of_find_node_by_type(memory, "memory");
208*ab1f9dacSPaul Mackerras 	if (!memory)
209*ab1f9dacSPaul Mackerras 		return 0; /* it won't matter */
210*ab1f9dacSPaul Mackerras 	rc = prom_n_size_cells(memory);
211*ab1f9dacSPaul Mackerras 	return rc;
212*ab1f9dacSPaul Mackerras }
213*ab1f9dacSPaul Mackerras 
/*
 * Assemble an unsigned long from @n big-endian 32-bit cells at *buf,
 * advancing *buf past the cells consumed.  With n == 0 the buffer is
 * untouched and 0 is returned.
 */
static unsigned long read_n_cells(int n, unsigned int **buf)
{
	unsigned long value = 0;
	unsigned int *cell;

	for (cell = *buf; n > 0; n--, cell++)
		value = (value << 32) | *cell;
	*buf = cell;
	return value;
}
224*ab1f9dacSPaul Mackerras 
225*ab1f9dacSPaul Mackerras /*
226*ab1f9dacSPaul Mackerras  * Figure out to which domain a cpu belongs and stick it there.
227*ab1f9dacSPaul Mackerras  * Return the id of the domain used.
228*ab1f9dacSPaul Mackerras  */
229*ab1f9dacSPaul Mackerras static int numa_setup_cpu(unsigned long lcpu)
230*ab1f9dacSPaul Mackerras {
231*ab1f9dacSPaul Mackerras 	int numa_domain = 0;
232*ab1f9dacSPaul Mackerras 	struct device_node *cpu = find_cpu_node(lcpu);
233*ab1f9dacSPaul Mackerras 
234*ab1f9dacSPaul Mackerras 	if (!cpu) {
235*ab1f9dacSPaul Mackerras 		WARN_ON(1);
236*ab1f9dacSPaul Mackerras 		goto out;
237*ab1f9dacSPaul Mackerras 	}
238*ab1f9dacSPaul Mackerras 
239*ab1f9dacSPaul Mackerras 	numa_domain = of_node_numa_domain(cpu);
240*ab1f9dacSPaul Mackerras 
241*ab1f9dacSPaul Mackerras 	if (numa_domain >= num_online_nodes()) {
242*ab1f9dacSPaul Mackerras 		/*
243*ab1f9dacSPaul Mackerras 		 * POWER4 LPAR uses 0xffff as invalid node,
244*ab1f9dacSPaul Mackerras 		 * dont warn in this case.
245*ab1f9dacSPaul Mackerras 		 */
246*ab1f9dacSPaul Mackerras 		if (numa_domain != 0xffff)
247*ab1f9dacSPaul Mackerras 			printk(KERN_ERR "WARNING: cpu %ld "
248*ab1f9dacSPaul Mackerras 			       "maps to invalid NUMA node %d\n",
249*ab1f9dacSPaul Mackerras 			       lcpu, numa_domain);
250*ab1f9dacSPaul Mackerras 		numa_domain = 0;
251*ab1f9dacSPaul Mackerras 	}
252*ab1f9dacSPaul Mackerras out:
253*ab1f9dacSPaul Mackerras 	node_set_online(numa_domain);
254*ab1f9dacSPaul Mackerras 
255*ab1f9dacSPaul Mackerras 	map_cpu_to_node(lcpu, numa_domain);
256*ab1f9dacSPaul Mackerras 
257*ab1f9dacSPaul Mackerras 	of_node_put(cpu);
258*ab1f9dacSPaul Mackerras 
259*ab1f9dacSPaul Mackerras 	return numa_domain;
260*ab1f9dacSPaul Mackerras }
261*ab1f9dacSPaul Mackerras 
262*ab1f9dacSPaul Mackerras static int cpu_numa_callback(struct notifier_block *nfb,
263*ab1f9dacSPaul Mackerras 			     unsigned long action,
264*ab1f9dacSPaul Mackerras 			     void *hcpu)
265*ab1f9dacSPaul Mackerras {
266*ab1f9dacSPaul Mackerras 	unsigned long lcpu = (unsigned long)hcpu;
267*ab1f9dacSPaul Mackerras 	int ret = NOTIFY_DONE;
268*ab1f9dacSPaul Mackerras 
269*ab1f9dacSPaul Mackerras 	switch (action) {
270*ab1f9dacSPaul Mackerras 	case CPU_UP_PREPARE:
271*ab1f9dacSPaul Mackerras 		if (min_common_depth == -1 || !numa_enabled)
272*ab1f9dacSPaul Mackerras 			map_cpu_to_node(lcpu, 0);
273*ab1f9dacSPaul Mackerras 		else
274*ab1f9dacSPaul Mackerras 			numa_setup_cpu(lcpu);
275*ab1f9dacSPaul Mackerras 		ret = NOTIFY_OK;
276*ab1f9dacSPaul Mackerras 		break;
277*ab1f9dacSPaul Mackerras #ifdef CONFIG_HOTPLUG_CPU
278*ab1f9dacSPaul Mackerras 	case CPU_DEAD:
279*ab1f9dacSPaul Mackerras 	case CPU_UP_CANCELED:
280*ab1f9dacSPaul Mackerras 		unmap_cpu_from_node(lcpu);
281*ab1f9dacSPaul Mackerras 		break;
282*ab1f9dacSPaul Mackerras 		ret = NOTIFY_OK;
283*ab1f9dacSPaul Mackerras #endif
284*ab1f9dacSPaul Mackerras 	}
285*ab1f9dacSPaul Mackerras 	return ret;
286*ab1f9dacSPaul Mackerras }
287*ab1f9dacSPaul Mackerras 
288*ab1f9dacSPaul Mackerras /*
289*ab1f9dacSPaul Mackerras  * Check and possibly modify a memory region to enforce the memory limit.
290*ab1f9dacSPaul Mackerras  *
291*ab1f9dacSPaul Mackerras  * Returns the size the region should have to enforce the memory limit.
292*ab1f9dacSPaul Mackerras  * This will either be the original value of size, a truncated value,
293*ab1f9dacSPaul Mackerras  * or zero. If the returned value of size is 0 the region should be
294*ab1f9dacSPaul Mackerras  * discarded as it lies wholy above the memory limit.
295*ab1f9dacSPaul Mackerras  */
296*ab1f9dacSPaul Mackerras static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
297*ab1f9dacSPaul Mackerras {
298*ab1f9dacSPaul Mackerras 	/*
299*ab1f9dacSPaul Mackerras 	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
300*ab1f9dacSPaul Mackerras 	 * we've already adjusted it for the limit and it takes care of
301*ab1f9dacSPaul Mackerras 	 * having memory holes below the limit.
302*ab1f9dacSPaul Mackerras 	 */
303*ab1f9dacSPaul Mackerras 	extern unsigned long memory_limit;
304*ab1f9dacSPaul Mackerras 
305*ab1f9dacSPaul Mackerras 	if (! memory_limit)
306*ab1f9dacSPaul Mackerras 		return size;
307*ab1f9dacSPaul Mackerras 
308*ab1f9dacSPaul Mackerras 	if (start + size <= lmb_end_of_DRAM())
309*ab1f9dacSPaul Mackerras 		return size;
310*ab1f9dacSPaul Mackerras 
311*ab1f9dacSPaul Mackerras 	if (start >= lmb_end_of_DRAM())
312*ab1f9dacSPaul Mackerras 		return 0;
313*ab1f9dacSPaul Mackerras 
314*ab1f9dacSPaul Mackerras 	return lmb_end_of_DRAM() - start;
315*ab1f9dacSPaul Mackerras }
316*ab1f9dacSPaul Mackerras 
/*
 * Build the NUMA picture from the device tree: size the memory lookup
 * table, find the associativity depth, seed cpu domains, and assign
 * every "memory" node's ranges to a domain.  Returns 0 on success or a
 * negative value when NUMA is disabled/unavailable (caller then falls
 * back to setup_nonnuma()).
 */
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int addr_cells, size_cells;
	int max_domain = 0;
	/* one lookup-table byte per MEMORY_INCREMENT of installed RAM */
	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	/* NOTE(review): lmb_alloc()'s result is used unchecked -- boot-time
	 * allocation failure is presumably fatal elsewhere, confirm. */
	numa_memory_lookup_table =
		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
	memset(numa_memory_lookup_table, 0, entries * sizeof(char));

	for (i = 0; i < entries ; i++)
		numa_memory_lookup_table[i] = ARRAY_INITIALISER;

	min_common_depth = find_min_common_depth();

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
	if (min_common_depth < 0)
		return min_common_depth;

	/* The boot cpu's domain seeds max_domain and comes online now. */
	max_domain = numa_setup_cpu(boot_cpuid);

	/*
	 * Even though we connect cpus to numa domains later in SMP init,
	 * we need to know the maximum node id now. This is because each
	 * node id must have NODE_DATA etc backing it.
	 * As a result of hotplug we could still have cpus appear later on
	 * with larger node ids. In that case we force the cpu into node 0.
	 */
	for_each_cpu(i) {
		int numa_domain;

		cpu = find_cpu_node(i);

		if (cpu) {
			numa_domain = of_node_numa_domain(cpu);
			of_node_put(cpu);

			/* Domains >= MAX_NUMNODES (e.g. 0xffff) are ignored. */
			if (numa_domain < MAX_NUMNODES &&
			    max_domain < numa_domain)
				max_domain = numa_domain;
		}
	}

	/* Walk every "memory" node; each may describe several ranges. */
	addr_cells = get_mem_addr_cells();
	size_cells = get_mem_size_cells();
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int numa_domain;
		int ranges;
		unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		ranges = memory->n_addrs;
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(addr_cells, &memcell_buf);
		size = read_n_cells(size_cells, &memcell_buf);

		/* Round to lookup-table granularity. */
		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
		size = _ALIGN_UP(size, MEMORY_INCREMENT);

		numa_domain = of_node_numa_domain(memory);

		if (numa_domain >= MAX_NUMNODES) {
			/* 0xffff is the "invalid node" marker; stay quiet. */
			if (numa_domain != 0xffff)
				printk(KERN_ERR "WARNING: memory at %lx maps "
				       "to invalid NUMA node %d\n", start,
				       numa_domain);
			numa_domain = 0;
		}

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		/* Range entirely above mem= limit: drop it and move on. */
		if (! (size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		/*
		 * Initialize new node struct, or add to an existing one.
		 */
		if (init_node_data[numa_domain].node_end_pfn) {
			/* Extend an existing node downward/upward as needed. */
			if ((start / PAGE_SIZE) <
			    init_node_data[numa_domain].node_start_pfn)
				init_node_data[numa_domain].node_start_pfn =
					start / PAGE_SIZE;
			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
			    init_node_data[numa_domain].node_end_pfn)
				init_node_data[numa_domain].node_end_pfn =
					(start / PAGE_SIZE) +
					(size / PAGE_SIZE);

			init_node_data[numa_domain].node_present_pages +=
				size / PAGE_SIZE;
		} else {
			node_set_online(numa_domain);

			init_node_data[numa_domain].node_start_pfn =
				start / PAGE_SIZE;
			init_node_data[numa_domain].node_end_pfn =
				init_node_data[numa_domain].node_start_pfn +
				size / PAGE_SIZE;
			init_node_data[numa_domain].node_present_pages =
				size / PAGE_SIZE;
		}

		/* Stamp the owning domain over the range's lookup entries. */
		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
				numa_domain;

		if (--ranges)
			goto new_range;
	}

	/* Every id up to the largest seen gets NODE_DATA backing later. */
	for (i = 0; i <= max_domain; i++)
		node_set_online(i);

	return 0;
}
453*ab1f9dacSPaul Mackerras 
454*ab1f9dacSPaul Mackerras static void __init setup_nonnuma(void)
455*ab1f9dacSPaul Mackerras {
456*ab1f9dacSPaul Mackerras 	unsigned long top_of_ram = lmb_end_of_DRAM();
457*ab1f9dacSPaul Mackerras 	unsigned long total_ram = lmb_phys_mem_size();
458*ab1f9dacSPaul Mackerras 	unsigned long i;
459*ab1f9dacSPaul Mackerras 
460*ab1f9dacSPaul Mackerras 	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
461*ab1f9dacSPaul Mackerras 	       top_of_ram, total_ram);
462*ab1f9dacSPaul Mackerras 	printk(KERN_INFO "Memory hole size: %ldMB\n",
463*ab1f9dacSPaul Mackerras 	       (top_of_ram - total_ram) >> 20);
464*ab1f9dacSPaul Mackerras 
465*ab1f9dacSPaul Mackerras 	if (!numa_memory_lookup_table) {
466*ab1f9dacSPaul Mackerras 		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
467*ab1f9dacSPaul Mackerras 		numa_memory_lookup_table =
468*ab1f9dacSPaul Mackerras 			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
469*ab1f9dacSPaul Mackerras 		memset(numa_memory_lookup_table, 0, entries * sizeof(char));
470*ab1f9dacSPaul Mackerras 		for (i = 0; i < entries ; i++)
471*ab1f9dacSPaul Mackerras 			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
472*ab1f9dacSPaul Mackerras 	}
473*ab1f9dacSPaul Mackerras 
474*ab1f9dacSPaul Mackerras 	map_cpu_to_node(boot_cpuid, 0);
475*ab1f9dacSPaul Mackerras 
476*ab1f9dacSPaul Mackerras 	node_set_online(0);
477*ab1f9dacSPaul Mackerras 
478*ab1f9dacSPaul Mackerras 	init_node_data[0].node_start_pfn = 0;
479*ab1f9dacSPaul Mackerras 	init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
480*ab1f9dacSPaul Mackerras 	init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
481*ab1f9dacSPaul Mackerras 
482*ab1f9dacSPaul Mackerras 	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
483*ab1f9dacSPaul Mackerras 		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
484*ab1f9dacSPaul Mackerras }
485*ab1f9dacSPaul Mackerras 
486*ab1f9dacSPaul Mackerras static void __init dump_numa_topology(void)
487*ab1f9dacSPaul Mackerras {
488*ab1f9dacSPaul Mackerras 	unsigned int node;
489*ab1f9dacSPaul Mackerras 	unsigned int count;
490*ab1f9dacSPaul Mackerras 
491*ab1f9dacSPaul Mackerras 	if (min_common_depth == -1 || !numa_enabled)
492*ab1f9dacSPaul Mackerras 		return;
493*ab1f9dacSPaul Mackerras 
494*ab1f9dacSPaul Mackerras 	for_each_online_node(node) {
495*ab1f9dacSPaul Mackerras 		unsigned long i;
496*ab1f9dacSPaul Mackerras 
497*ab1f9dacSPaul Mackerras 		printk(KERN_INFO "Node %d Memory:", node);
498*ab1f9dacSPaul Mackerras 
499*ab1f9dacSPaul Mackerras 		count = 0;
500*ab1f9dacSPaul Mackerras 
501*ab1f9dacSPaul Mackerras 		for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
502*ab1f9dacSPaul Mackerras 			if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
503*ab1f9dacSPaul Mackerras 				if (count == 0)
504*ab1f9dacSPaul Mackerras 					printk(" 0x%lx", i);
505*ab1f9dacSPaul Mackerras 				++count;
506*ab1f9dacSPaul Mackerras 			} else {
507*ab1f9dacSPaul Mackerras 				if (count > 0)
508*ab1f9dacSPaul Mackerras 					printk("-0x%lx", i);
509*ab1f9dacSPaul Mackerras 				count = 0;
510*ab1f9dacSPaul Mackerras 			}
511*ab1f9dacSPaul Mackerras 		}
512*ab1f9dacSPaul Mackerras 
513*ab1f9dacSPaul Mackerras 		if (count > 0)
514*ab1f9dacSPaul Mackerras 			printk("-0x%lx", i);
515*ab1f9dacSPaul Mackerras 		printk("\n");
516*ab1f9dacSPaul Mackerras 	}
517*ab1f9dacSPaul Mackerras 	return;
518*ab1f9dacSPaul Mackerras }
519*ab1f9dacSPaul Mackerras 
/*
 * Allocate some memory, satisfying the lmb or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the physical address of the memory.
 */
static unsigned long careful_allocation(int nid, unsigned long size,
					unsigned long align, unsigned long end)
{
	/* First try below @end, i.e. within the preferred node. */
	unsigned long ret = lmb_alloc_base(size, align, end);

	/* retry over all memory */
	if (!ret)
		ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, nid);

	/*
	 * If the memory came from a previously allocated node, we must
	 * retry with the bootmem allocator.
	 */
	/* NOTE(review): relies on lower-numbered nodes having already set
	 * up their bootmem, so lmb space there may collide with it --
	 * confirm against the caller's per-node iteration order. */
	if (pa_to_nid(ret) < nid) {
		nid = pa_to_nid(ret);
		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
				size, align, 0);

		if (!ret)
			panic("numa.c: cannot allocate %lu bytes on node %d",
			      size, nid);

		/* Bootmem hands back a virtual address; convert so the
		 * caller consistently receives a physical one. */
		ret = virt_to_abs(ret);

		dbg("alloc_bootmem %lx %lx\n", ret, size);
	}

	return ret;
}
560*ab1f9dacSPaul Mackerras 
561*ab1f9dacSPaul Mackerras void __init do_init_bootmem(void)
562*ab1f9dacSPaul Mackerras {
563*ab1f9dacSPaul Mackerras 	int nid;
564*ab1f9dacSPaul Mackerras 	int addr_cells, size_cells;
565*ab1f9dacSPaul Mackerras 	struct device_node *memory = NULL;
566*ab1f9dacSPaul Mackerras 	static struct notifier_block ppc64_numa_nb = {
567*ab1f9dacSPaul Mackerras 		.notifier_call = cpu_numa_callback,
568*ab1f9dacSPaul Mackerras 		.priority = 1 /* Must run before sched domains notifier. */
569*ab1f9dacSPaul Mackerras 	};
570*ab1f9dacSPaul Mackerras 
571*ab1f9dacSPaul Mackerras 	min_low_pfn = 0;
572*ab1f9dacSPaul Mackerras 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
573*ab1f9dacSPaul Mackerras 	max_pfn = max_low_pfn;
574*ab1f9dacSPaul Mackerras 
575*ab1f9dacSPaul Mackerras 	if (parse_numa_properties())
576*ab1f9dacSPaul Mackerras 		setup_nonnuma();
577*ab1f9dacSPaul Mackerras 	else
578*ab1f9dacSPaul Mackerras 		dump_numa_topology();
579*ab1f9dacSPaul Mackerras 
580*ab1f9dacSPaul Mackerras 	register_cpu_notifier(&ppc64_numa_nb);
581*ab1f9dacSPaul Mackerras 
582*ab1f9dacSPaul Mackerras 	for_each_online_node(nid) {
583*ab1f9dacSPaul Mackerras 		unsigned long start_paddr, end_paddr;
584*ab1f9dacSPaul Mackerras 		int i;
585*ab1f9dacSPaul Mackerras 		unsigned long bootmem_paddr;
586*ab1f9dacSPaul Mackerras 		unsigned long bootmap_pages;
587*ab1f9dacSPaul Mackerras 
588*ab1f9dacSPaul Mackerras 		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
589*ab1f9dacSPaul Mackerras 		end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
590*ab1f9dacSPaul Mackerras 
591*ab1f9dacSPaul Mackerras 		/* Allocate the node structure node local if possible */
592*ab1f9dacSPaul Mackerras 		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
593*ab1f9dacSPaul Mackerras 					sizeof(struct pglist_data),
594*ab1f9dacSPaul Mackerras 					SMP_CACHE_BYTES, end_paddr);
595*ab1f9dacSPaul Mackerras 		NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
596*ab1f9dacSPaul Mackerras 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
597*ab1f9dacSPaul Mackerras 
598*ab1f9dacSPaul Mackerras   		dbg("node %d\n", nid);
599*ab1f9dacSPaul Mackerras 		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
600*ab1f9dacSPaul Mackerras 
601*ab1f9dacSPaul Mackerras 		NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
602*ab1f9dacSPaul Mackerras 		NODE_DATA(nid)->node_start_pfn =
603*ab1f9dacSPaul Mackerras 			init_node_data[nid].node_start_pfn;
604*ab1f9dacSPaul Mackerras 		NODE_DATA(nid)->node_spanned_pages =
605*ab1f9dacSPaul Mackerras 			end_paddr - start_paddr;
606*ab1f9dacSPaul Mackerras 
607*ab1f9dacSPaul Mackerras 		if (NODE_DATA(nid)->node_spanned_pages == 0)
608*ab1f9dacSPaul Mackerras   			continue;
609*ab1f9dacSPaul Mackerras 
610*ab1f9dacSPaul Mackerras   		dbg("start_paddr = %lx\n", start_paddr);
611*ab1f9dacSPaul Mackerras   		dbg("end_paddr = %lx\n", end_paddr);
612*ab1f9dacSPaul Mackerras 
613*ab1f9dacSPaul Mackerras 		bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
614*ab1f9dacSPaul Mackerras 
615*ab1f9dacSPaul Mackerras 		bootmem_paddr = careful_allocation(nid,
616*ab1f9dacSPaul Mackerras 				bootmap_pages << PAGE_SHIFT,
617*ab1f9dacSPaul Mackerras 				PAGE_SIZE, end_paddr);
618*ab1f9dacSPaul Mackerras 		memset(abs_to_virt(bootmem_paddr), 0,
619*ab1f9dacSPaul Mackerras 		       bootmap_pages << PAGE_SHIFT);
620*ab1f9dacSPaul Mackerras 		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
621*ab1f9dacSPaul Mackerras 
622*ab1f9dacSPaul Mackerras 		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
623*ab1f9dacSPaul Mackerras 				  start_paddr >> PAGE_SHIFT,
624*ab1f9dacSPaul Mackerras 				  end_paddr >> PAGE_SHIFT);
625*ab1f9dacSPaul Mackerras 
626*ab1f9dacSPaul Mackerras 		/*
627*ab1f9dacSPaul Mackerras 		 * We need to do another scan of all memory sections to
628*ab1f9dacSPaul Mackerras 		 * associate memory with the correct node.
629*ab1f9dacSPaul Mackerras 		 */
630*ab1f9dacSPaul Mackerras 		addr_cells = get_mem_addr_cells();
631*ab1f9dacSPaul Mackerras 		size_cells = get_mem_size_cells();
632*ab1f9dacSPaul Mackerras 		memory = NULL;
633*ab1f9dacSPaul Mackerras 		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
634*ab1f9dacSPaul Mackerras 			unsigned long mem_start, mem_size;
635*ab1f9dacSPaul Mackerras 			int numa_domain, ranges;
636*ab1f9dacSPaul Mackerras 			unsigned int *memcell_buf;
637*ab1f9dacSPaul Mackerras 			unsigned int len;
638*ab1f9dacSPaul Mackerras 
639*ab1f9dacSPaul Mackerras 			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
640*ab1f9dacSPaul Mackerras 			if (!memcell_buf || len <= 0)
641*ab1f9dacSPaul Mackerras 				continue;
642*ab1f9dacSPaul Mackerras 
643*ab1f9dacSPaul Mackerras 			ranges = memory->n_addrs;	/* ranges in cell */
644*ab1f9dacSPaul Mackerras new_range:
645*ab1f9dacSPaul Mackerras 			mem_start = read_n_cells(addr_cells, &memcell_buf);
646*ab1f9dacSPaul Mackerras 			mem_size = read_n_cells(size_cells, &memcell_buf);
647*ab1f9dacSPaul Mackerras 			if (numa_enabled) {
648*ab1f9dacSPaul Mackerras 				numa_domain = of_node_numa_domain(memory);
649*ab1f9dacSPaul Mackerras 				if (numa_domain  >= MAX_NUMNODES)
650*ab1f9dacSPaul Mackerras 					numa_domain = 0;
651*ab1f9dacSPaul Mackerras 			} else
652*ab1f9dacSPaul Mackerras 				numa_domain =  0;
653*ab1f9dacSPaul Mackerras 
654*ab1f9dacSPaul Mackerras 			if (numa_domain != nid)
655*ab1f9dacSPaul Mackerras 				continue;
656*ab1f9dacSPaul Mackerras 
657*ab1f9dacSPaul Mackerras 			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
658*ab1f9dacSPaul Mackerras   			if (mem_size) {
659*ab1f9dacSPaul Mackerras   				dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
660*ab1f9dacSPaul Mackerras   				free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
661*ab1f9dacSPaul Mackerras 			}
662*ab1f9dacSPaul Mackerras 
663*ab1f9dacSPaul Mackerras 			if (--ranges)		/* process all ranges in cell */
664*ab1f9dacSPaul Mackerras 				goto new_range;
665*ab1f9dacSPaul Mackerras 		}
666*ab1f9dacSPaul Mackerras 
667*ab1f9dacSPaul Mackerras 		/*
668*ab1f9dacSPaul Mackerras 		 * Mark reserved regions on this node
669*ab1f9dacSPaul Mackerras 		 */
670*ab1f9dacSPaul Mackerras 		for (i = 0; i < lmb.reserved.cnt; i++) {
671*ab1f9dacSPaul Mackerras 			unsigned long physbase = lmb.reserved.region[i].base;
672*ab1f9dacSPaul Mackerras 			unsigned long size = lmb.reserved.region[i].size;
673*ab1f9dacSPaul Mackerras 
674*ab1f9dacSPaul Mackerras 			if (pa_to_nid(physbase) != nid &&
675*ab1f9dacSPaul Mackerras 			    pa_to_nid(physbase+size-1) != nid)
676*ab1f9dacSPaul Mackerras 				continue;
677*ab1f9dacSPaul Mackerras 
678*ab1f9dacSPaul Mackerras 			if (physbase < end_paddr &&
679*ab1f9dacSPaul Mackerras 			    (physbase+size) > start_paddr) {
680*ab1f9dacSPaul Mackerras 				/* overlaps */
681*ab1f9dacSPaul Mackerras 				if (physbase < start_paddr) {
682*ab1f9dacSPaul Mackerras 					size -= start_paddr - physbase;
683*ab1f9dacSPaul Mackerras 					physbase = start_paddr;
684*ab1f9dacSPaul Mackerras 				}
685*ab1f9dacSPaul Mackerras 
686*ab1f9dacSPaul Mackerras 				if (size > end_paddr - physbase)
687*ab1f9dacSPaul Mackerras 					size = end_paddr - physbase;
688*ab1f9dacSPaul Mackerras 
689*ab1f9dacSPaul Mackerras 				dbg("reserve_bootmem %lx %lx\n", physbase,
690*ab1f9dacSPaul Mackerras 				    size);
691*ab1f9dacSPaul Mackerras 				reserve_bootmem_node(NODE_DATA(nid), physbase,
692*ab1f9dacSPaul Mackerras 						     size);
693*ab1f9dacSPaul Mackerras 			}
694*ab1f9dacSPaul Mackerras 		}
695*ab1f9dacSPaul Mackerras 		/*
696*ab1f9dacSPaul Mackerras 		 * This loop may look familiar, but we have to do it again
697*ab1f9dacSPaul Mackerras 		 * after marking our reserved memory to mark memory present
698*ab1f9dacSPaul Mackerras 		 * for sparsemem.
699*ab1f9dacSPaul Mackerras 		 */
700*ab1f9dacSPaul Mackerras 		addr_cells = get_mem_addr_cells();
701*ab1f9dacSPaul Mackerras 		size_cells = get_mem_size_cells();
702*ab1f9dacSPaul Mackerras 		memory = NULL;
703*ab1f9dacSPaul Mackerras 		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
704*ab1f9dacSPaul Mackerras 			unsigned long mem_start, mem_size;
705*ab1f9dacSPaul Mackerras 			int numa_domain, ranges;
706*ab1f9dacSPaul Mackerras 			unsigned int *memcell_buf;
707*ab1f9dacSPaul Mackerras 			unsigned int len;
708*ab1f9dacSPaul Mackerras 
709*ab1f9dacSPaul Mackerras 			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
710*ab1f9dacSPaul Mackerras 			if (!memcell_buf || len <= 0)
711*ab1f9dacSPaul Mackerras 				continue;
712*ab1f9dacSPaul Mackerras 
713*ab1f9dacSPaul Mackerras 			ranges = memory->n_addrs;	/* ranges in cell */
714*ab1f9dacSPaul Mackerras new_range2:
715*ab1f9dacSPaul Mackerras 			mem_start = read_n_cells(addr_cells, &memcell_buf);
716*ab1f9dacSPaul Mackerras 			mem_size = read_n_cells(size_cells, &memcell_buf);
717*ab1f9dacSPaul Mackerras 			if (numa_enabled) {
718*ab1f9dacSPaul Mackerras 				numa_domain = of_node_numa_domain(memory);
719*ab1f9dacSPaul Mackerras 				if (numa_domain  >= MAX_NUMNODES)
720*ab1f9dacSPaul Mackerras 					numa_domain = 0;
721*ab1f9dacSPaul Mackerras 			} else
722*ab1f9dacSPaul Mackerras 				numa_domain =  0;
723*ab1f9dacSPaul Mackerras 
724*ab1f9dacSPaul Mackerras 			if (numa_domain != nid)
725*ab1f9dacSPaul Mackerras 				continue;
726*ab1f9dacSPaul Mackerras 
727*ab1f9dacSPaul Mackerras 			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
728*ab1f9dacSPaul Mackerras 			memory_present(numa_domain, mem_start >> PAGE_SHIFT,
729*ab1f9dacSPaul Mackerras 				       (mem_start + mem_size) >> PAGE_SHIFT);
730*ab1f9dacSPaul Mackerras 
731*ab1f9dacSPaul Mackerras 			if (--ranges)		/* process all ranges in cell */
732*ab1f9dacSPaul Mackerras 				goto new_range2;
733*ab1f9dacSPaul Mackerras 		}
734*ab1f9dacSPaul Mackerras 
735*ab1f9dacSPaul Mackerras 	}
736*ab1f9dacSPaul Mackerras }
737*ab1f9dacSPaul Mackerras 
738*ab1f9dacSPaul Mackerras void __init paging_init(void)
739*ab1f9dacSPaul Mackerras {
740*ab1f9dacSPaul Mackerras 	unsigned long zones_size[MAX_NR_ZONES];
741*ab1f9dacSPaul Mackerras 	unsigned long zholes_size[MAX_NR_ZONES];
742*ab1f9dacSPaul Mackerras 	int nid;
743*ab1f9dacSPaul Mackerras 
744*ab1f9dacSPaul Mackerras 	memset(zones_size, 0, sizeof(zones_size));
745*ab1f9dacSPaul Mackerras 	memset(zholes_size, 0, sizeof(zholes_size));
746*ab1f9dacSPaul Mackerras 
747*ab1f9dacSPaul Mackerras 	for_each_online_node(nid) {
748*ab1f9dacSPaul Mackerras 		unsigned long start_pfn;
749*ab1f9dacSPaul Mackerras 		unsigned long end_pfn;
750*ab1f9dacSPaul Mackerras 
751*ab1f9dacSPaul Mackerras 		start_pfn = init_node_data[nid].node_start_pfn;
752*ab1f9dacSPaul Mackerras 		end_pfn = init_node_data[nid].node_end_pfn;
753*ab1f9dacSPaul Mackerras 
754*ab1f9dacSPaul Mackerras 		zones_size[ZONE_DMA] = end_pfn - start_pfn;
755*ab1f9dacSPaul Mackerras 		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
756*ab1f9dacSPaul Mackerras 			init_node_data[nid].node_present_pages;
757*ab1f9dacSPaul Mackerras 
758*ab1f9dacSPaul Mackerras 		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
759*ab1f9dacSPaul Mackerras 		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
760*ab1f9dacSPaul Mackerras 
761*ab1f9dacSPaul Mackerras 		free_area_init_node(nid, NODE_DATA(nid), zones_size,
762*ab1f9dacSPaul Mackerras 							start_pfn, zholes_size);
763*ab1f9dacSPaul Mackerras 	}
764*ab1f9dacSPaul Mackerras }
765*ab1f9dacSPaul Mackerras 
766*ab1f9dacSPaul Mackerras static int __init early_numa(char *p)
767*ab1f9dacSPaul Mackerras {
768*ab1f9dacSPaul Mackerras 	if (!p)
769*ab1f9dacSPaul Mackerras 		return 0;
770*ab1f9dacSPaul Mackerras 
771*ab1f9dacSPaul Mackerras 	if (strstr(p, "off"))
772*ab1f9dacSPaul Mackerras 		numa_enabled = 0;
773*ab1f9dacSPaul Mackerras 
774*ab1f9dacSPaul Mackerras 	if (strstr(p, "debug"))
775*ab1f9dacSPaul Mackerras 		numa_debug = 1;
776*ab1f9dacSPaul Mackerras 
777*ab1f9dacSPaul Mackerras 	return 0;
778*ab1f9dacSPaul Mackerras }
779*ab1f9dacSPaul Mackerras early_param("numa", early_numa);
780