xref: /linux/mm/page_alloc.c (revision 99b5aa3c10c7cff1e97239fda93649222fc12d25)
1 /*
2  *  linux/mm/page_alloc.c
3  *
4  *  Manages the free list; the system allocates free pages here.
5  *  Note that kmalloc() lives in slab.c
6  *
7  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
8  *  Swap reorganised 29.12.95, Stephen Tweedie
9  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15  */
16 
17 #include <linux/stddef.h>
18 #include <linux/mm.h>
19 #include <linux/swap.h>
20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h>
22 #include <linux/bootmem.h>
23 #include <linux/compiler.h>
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/suspend.h>
27 #include <linux/pagevec.h>
28 #include <linux/blkdev.h>
29 #include <linux/slab.h>
30 #include <linux/notifier.h>
31 #include <linux/topology.h>
32 #include <linux/sysctl.h>
33 #include <linux/cpu.h>
34 #include <linux/cpuset.h>
35 #include <linux/memory_hotplug.h>
36 #include <linux/nodemask.h>
37 #include <linux/vmalloc.h>
38 #include <linux/mempolicy.h>
39 #include <linux/stop_machine.h>
40 #include <linux/sort.h>
41 #include <linux/pfn.h>
42 #include <linux/backing-dev.h>
43 #include <linux/fault-inject.h>
44 
45 #include <asm/tlbflush.h>
46 #include <asm/div64.h>
47 #include "internal.h"
48 
49 /*
50  * MCD - HACK: Find somewhere to initialize this EARLY, or make this
51  * initializer cleaner
52  */
53 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
54 EXPORT_SYMBOL(node_online_map);
55 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
56 EXPORT_SYMBOL(node_possible_map);
57 unsigned long totalram_pages __read_mostly;
58 unsigned long totalreserve_pages __read_mostly;
59 long nr_swap_pages;
60 int percpu_pagelist_fraction;
61 
62 static void __free_pages_ok(struct page *page, unsigned int order);
63 
64 /*
65  * results with 256, 32 in the lowmem_reserve sysctl:
66  *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
67  *	1G machine -> (16M dma, 784M normal, 224M high)
68  *	NORMAL allocation will leave 784M/256 of ram reserved in ZONE_DMA
69  *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
70  *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
71  *
72  * TBD: should special case ZONE_DMA32 machines here - in those we normally
73  * don't need any ZONE_NORMAL reservation
74  */
75 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
76 	 256,
77 #ifdef CONFIG_ZONE_DMA32
78 	 256,
79 #endif
80 #ifdef CONFIG_HIGHMEM
81 	 32
82 #endif
83 };
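/*
 * Illustrative worked example (added for clarity; not part of the original
 * source).  On the 1G machine above (16M dma, 784M normal, 224M high) the
 * ratios translate into reserves of roughly
 *
 *	ZONE_DMA    vs. NORMAL  allocations:          784M/256  ~= 3M
 *	ZONE_NORMAL vs. HIGHMEM allocations:          224M/32   ~= 7M
 *	ZONE_DMA    vs. HIGHMEM allocations:   (784M+224M)/256  ~= 4M
 *
 * i.e. a lower zone keeps (present pages of the zones above it, up to the
 * allocation's class zone) / sysctl_lowmem_reserve_ratio[lower zone] pages
 * out of reach; see setup_per_zone_lowmem_reserve() later in this file.
 */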
84 
85 EXPORT_SYMBOL(totalram_pages);
86 
87 static char * const zone_names[MAX_NR_ZONES] = {
88 	 "DMA",
89 #ifdef CONFIG_ZONE_DMA32
90 	 "DMA32",
91 #endif
92 	 "Normal",
93 #ifdef CONFIG_HIGHMEM
94 	 "HighMem"
95 #endif
96 };
97 
98 int min_free_kbytes = 1024;
99 
100 unsigned long __meminitdata nr_kernel_pages;
101 unsigned long __meminitdata nr_all_pages;
102 static unsigned long __initdata dma_reserve;
103 
104 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
105   /*
106    * MAX_ACTIVE_REGIONS determines the maximum number of distinct
107    * ranges of memory (RAM) that may be registered with add_active_range().
108    * Ranges passed to add_active_range() will be merged if possible
109    * so the number of times add_active_range() can be called is
110    * related to the number of nodes and the number of holes
111    */
112   #ifdef CONFIG_MAX_ACTIVE_REGIONS
113     /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
114     #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
115   #else
116     #if MAX_NUMNODES >= 32
117       /* If there can be many nodes, allow up to 50 holes per node */
118       #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
119     #else
120       /* By default, allow up to 256 distinct regions */
121       #define MAX_ACTIVE_REGIONS 256
122     #endif
123   #endif
124 
125   struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
126   int __initdata nr_nodemap_entries;
127   unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
128   unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
129 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
130   unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
131   unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
132 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
133 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
134 
135 #ifdef CONFIG_DEBUG_VM
136 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
137 {
138 	int ret = 0;
139 	unsigned seq;
140 	unsigned long pfn = page_to_pfn(page);
141 
142 	do {
143 		seq = zone_span_seqbegin(zone);
144 		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
145 			ret = 1;
146 		else if (pfn < zone->zone_start_pfn)
147 			ret = 1;
148 	} while (zone_span_seqretry(zone, seq));
149 
150 	return ret;
151 }
152 
153 static int page_is_consistent(struct zone *zone, struct page *page)
154 {
155 #ifdef CONFIG_HOLES_IN_ZONE
156 	if (!pfn_valid(page_to_pfn(page)))
157 		return 0;
158 #endif
159 	if (zone != page_zone(page))
160 		return 0;
161 
162 	return 1;
163 }
164 /*
165  * Temporary debugging check for pages not lying within a given zone.
166  */
167 static int bad_range(struct zone *zone, struct page *page)
168 {
169 	if (page_outside_zone_boundaries(zone, page))
170 		return 1;
171 	if (!page_is_consistent(zone, page))
172 		return 1;
173 
174 	return 0;
175 }
176 #else
177 static inline int bad_range(struct zone *zone, struct page *page)
178 {
179 	return 0;
180 }
181 #endif
182 
183 static void bad_page(struct page *page)
184 {
185 	printk(KERN_EMERG "Bad page state in process '%s'\n"
186 		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
187 		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
188 		KERN_EMERG "Backtrace:\n",
189 		current->comm, page, (int)(2*sizeof(unsigned long)),
190 		(unsigned long)page->flags, page->mapping,
191 		page_mapcount(page), page_count(page));
192 	dump_stack();
193 	page->flags &= ~(1 << PG_lru	|
194 			1 << PG_private |
195 			1 << PG_locked	|
196 			1 << PG_active	|
197 			1 << PG_dirty	|
198 			1 << PG_reclaim |
199 			1 << PG_slab    |
200 			1 << PG_swapcache |
201 			1 << PG_writeback |
202 			1 << PG_buddy );
203 	set_page_count(page, 0);
204 	reset_page_mapcount(page);
205 	page->mapping = NULL;
206 	add_taint(TAINT_BAD_PAGE);
207 }
208 
209 /*
210  * Higher-order pages are called "compound pages".  They are structured thusly:
211  *
212  * The first PAGE_SIZE page is called the "head page".
213  *
214  * The remaining PAGE_SIZE pages are called "tail pages".
215  *
216  * All pages have PG_compound set.  All pages have their ->private pointing at
217  * the head page (even the head page has this).
218  *
219  * The first tail page's ->lru.next holds the address of the compound page's
220  * put_page() function.  Its ->lru.prev holds the order of allocation.
221  * This usage means that zero-order pages may not be compound.
222  */
223 
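/*
 * Illustrative layout (added for clarity; not part of the original source):
 * after prep_compound_page(page, 2), defined below, the four struct pages
 * look like
 *
 *	page[0]  head:  PG_compound, ->private == page
 *	page[1]  tail:  PG_compound, ->private == page,
 *	                ->lru.next == free_compound_page (the dtor),
 *	                ->lru.prev == 2 (the order)
 *	page[2]  tail:  PG_compound, ->private == page
 *	page[3]  tail:  PG_compound, ->private == page
 *
 * destroy_compound_page() checks the order and these back pointers before
 * the pages are handed back to the buddy allocator.
 */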
224 static void free_compound_page(struct page *page)
225 {
226 	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
227 }
228 
229 static void prep_compound_page(struct page *page, unsigned long order)
230 {
231 	int i;
232 	int nr_pages = 1 << order;
233 
234 	set_compound_page_dtor(page, free_compound_page);
235 	page[1].lru.prev = (void *)order;
236 	for (i = 0; i < nr_pages; i++) {
237 		struct page *p = page + i;
238 
239 		__SetPageCompound(p);
240 		set_page_private(p, (unsigned long)page);
241 	}
242 }
243 
244 static void destroy_compound_page(struct page *page, unsigned long order)
245 {
246 	int i;
247 	int nr_pages = 1 << order;
248 
249 	if (unlikely((unsigned long)page[1].lru.prev != order))
250 		bad_page(page);
251 
252 	for (i = 0; i < nr_pages; i++) {
253 		struct page *p = page + i;
254 
255 		if (unlikely(!PageCompound(p) |
256 				(page_private(p) != (unsigned long)page)))
257 			bad_page(page);
258 		__ClearPageCompound(p);
259 	}
260 }
261 
262 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
263 {
264 	int i;
265 
266 	VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
267 	/*
268 	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
269 	 * and __GFP_HIGHMEM from hard or soft interrupt context.
270 	 */
271 	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
272 	for (i = 0; i < (1 << order); i++)
273 		clear_highpage(page + i);
274 }
275 
276 /*
277  * Functions for dealing with a page's order in the buddy system.
278  * zone->lock is already acquired when we use these.
279  * So, we don't need atomic page->flags operations here.
280  */
281 static inline unsigned long page_order(struct page *page)
282 {
283 	return page_private(page);
284 }
285 
286 static inline void set_page_order(struct page *page, int order)
287 {
288 	set_page_private(page, order);
289 	__SetPageBuddy(page);
290 }
291 
292 static inline void rmv_page_order(struct page *page)
293 {
294 	__ClearPageBuddy(page);
295 	set_page_private(page, 0);
296 }
297 
298 /*
299  * Locate the struct page for both the matching buddy in our
300  * pair (buddy1) and the combined O(n+1) page they form (page).
301  * pair (buddy1) and the combined order O+1 page they form (page).
302  * 1) Any buddy B1 will have an order O twin B2 which satisfies
303  * the following equation:
304  *     B2 = B1 ^ (1 << O)
305  * For example, if the starting buddy (buddy1) is #8, its order
306  * 1 buddy is #10:
307  *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
308  *
309  * 2) Any buddy B will have an order O+1 parent P which
310  * satisfies the following equation:
311  *     P = B & ~(1 << O)
312  *
313  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
314  */
315 static inline struct page *
316 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
317 {
318 	unsigned long buddy_idx = page_idx ^ (1 << order);
319 
320 	return page + (buddy_idx - page_idx);
321 }
322 
323 static inline unsigned long
324 __find_combined_index(unsigned long page_idx, unsigned int order)
325 {
326 	return (page_idx & ~(1 << order));
327 }
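/*
 * Worked example (added for clarity; illustrative only): with
 * page_idx == 8 and order == 1,
 *
 *	buddy_idx    = 8 ^ (1 << 1) = 10	(__page_find_buddy)
 *	combined_idx = 8 & ~(1 << 1) = 8	(__find_combined_index)
 *
 * so the order-1 block at index 8 merges with its buddy at index 10 into
 * an order-2 block that again starts at index 8.
 */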
328 
329 /*
330  * This function checks whether a page is free && is the buddy.
331  * We can coalesce a page and its buddy if
332  * (a) the buddy is not in a hole &&
333  * (b) the buddy is in the buddy system &&
334  * (c) a page and its buddy have the same order &&
335  * (d) a page and its buddy are in the same zone.
336  *
337  * For recording whether a page is in the buddy system, we use PG_buddy.
338  * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
339  *
340  * For recording page's order, we use page_private(page).
341  */
342 static inline int page_is_buddy(struct page *page, struct page *buddy,
343 								int order)
344 {
345 #ifdef CONFIG_HOLES_IN_ZONE
346 	if (!pfn_valid(page_to_pfn(buddy)))
347 		return 0;
348 #endif
349 
350 	if (page_zone_id(page) != page_zone_id(buddy))
351 		return 0;
352 
353 	if (PageBuddy(buddy) && page_order(buddy) == order) {
354 		BUG_ON(page_count(buddy) != 0);
355 		return 1;
356 	}
357 	return 0;
358 }
359 
360 /*
361  * Freeing function for a buddy system allocator.
362  *
363  * The concept of a buddy system is to maintain a direct-mapped table
364  * (containing bit values) for memory blocks of various "orders".
365  * The bottom level table contains the map for the smallest allocatable
366  * units of memory (here, pages), and each level above it describes
367  * pairs of units from the levels below, hence, "buddies".
368  * At a high level, all that happens here is marking the table entry
369  * at the bottom level available, and propagating the changes upward
370  * as necessary, plus some accounting needed to play nicely with other
371  * parts of the VM system.
372  * At each level, we keep a list of pages, which are heads of contiguous
373  * runs of free pages of length (1 << order), marked with PG_buddy. A page's
374  * order is recorded in the page_private(page) field.
375  * So when we are allocating or freeing one, we can derive the state of the
376  * other.  That is, if we allocate a small block, and both were
377  * free, the remainder of the region must be split into blocks.
378  * If a block is freed, and its buddy is also free, then this
379  * triggers coalescing into a block of larger size.
380  *
381  * -- wli
382  */
383 
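/*
 * Illustrative trace (added for clarity; not part of the original source):
 * freeing an order-0 page at page_idx 12 while page 13 (order 0) and the
 * block at 14 (order 1) are already free proceeds as
 *
 *	order 0: buddy_idx = 12 ^ 1 = 13, free -> merge, page_idx stays 12
 *	order 1: buddy_idx = 12 ^ 2 = 14, free -> merge, page_idx stays 12
 *	order 2: buddy_idx = 12 ^ 4 =  8, busy -> stop
 *
 * leaving one order-2 block at index 12 on free_area[2].free_list.
 */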
384 static inline void __free_one_page(struct page *page,
385 		struct zone *zone, unsigned int order)
386 {
387 	unsigned long page_idx;
388 	int order_size = 1 << order;
389 
390 	if (unlikely(PageCompound(page)))
391 		destroy_compound_page(page, order);
392 
393 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
394 
395 	VM_BUG_ON(page_idx & (order_size - 1));
396 	VM_BUG_ON(bad_range(zone, page));
397 
398 	zone->free_pages += order_size;
399 	while (order < MAX_ORDER-1) {
400 		unsigned long combined_idx;
401 		struct free_area *area;
402 		struct page *buddy;
403 
404 		buddy = __page_find_buddy(page, page_idx, order);
405 		if (!page_is_buddy(page, buddy, order))
406 			break;		/* Move the buddy up one level. */
407 
408 		list_del(&buddy->lru);
409 		area = zone->free_area + order;
410 		area->nr_free--;
411 		rmv_page_order(buddy);
412 		combined_idx = __find_combined_index(page_idx, order);
413 		page = page + (combined_idx - page_idx);
414 		page_idx = combined_idx;
415 		order++;
416 	}
417 	set_page_order(page, order);
418 	list_add(&page->lru, &zone->free_area[order].free_list);
419 	zone->free_area[order].nr_free++;
420 }
421 
422 static inline int free_pages_check(struct page *page)
423 {
424 	if (unlikely(page_mapcount(page) |
425 		(page->mapping != NULL)  |
426 		(page_count(page) != 0)  |
427 		(page->flags & (
428 			1 << PG_lru	|
429 			1 << PG_private |
430 			1 << PG_locked	|
431 			1 << PG_active	|
432 			1 << PG_reclaim	|
433 			1 << PG_slab	|
434 			1 << PG_swapcache |
435 			1 << PG_writeback |
436 			1 << PG_reserved |
437 			1 << PG_buddy ))))
438 		bad_page(page);
439 	if (PageDirty(page))
440 		__ClearPageDirty(page);
441 	/*
442 	 * For now, we report if PG_reserved was found set, but do not
443 	 * clear it, and do not free the page.  But we shall soon need
444 	 * to do more, for when the ZERO_PAGE count wraps negative.
445 	 */
446 	return PageReserved(page);
447 }
448 
449 /*
450  * Frees a list of pages.
451  * Assumes all pages on the list are in the same zone, and of the same order.
452  * count is the number of pages to free.
453  *
454  * If the zone was previously in an "all pages pinned" state then look to
455  * see if this freeing clears that state.
456  *
457  * And clear the zone's pages_scanned counter, to hold off the "all pages are
458  * pinned" detection logic.
459  */
460 static void free_pages_bulk(struct zone *zone, int count,
461 					struct list_head *list, int order)
462 {
463 	spin_lock(&zone->lock);
464 	zone->all_unreclaimable = 0;
465 	zone->pages_scanned = 0;
466 	while (count--) {
467 		struct page *page;
468 
469 		VM_BUG_ON(list_empty(list));
470 		page = list_entry(list->prev, struct page, lru);
471 		/* have to delete it as __free_one_page manipulates the free lists */
472 		list_del(&page->lru);
473 		__free_one_page(page, zone, order);
474 	}
475 	spin_unlock(&zone->lock);
476 }
477 
478 static void free_one_page(struct zone *zone, struct page *page, int order)
479 {
480 	spin_lock(&zone->lock);
481 	zone->all_unreclaimable = 0;
482 	zone->pages_scanned = 0;
483 	__free_one_page(page, zone, order);
484 	spin_unlock(&zone->lock);
485 }
486 
487 static void __free_pages_ok(struct page *page, unsigned int order)
488 {
489 	unsigned long flags;
490 	int i;
491 	int reserved = 0;
492 
493 	for (i = 0 ; i < (1 << order) ; ++i)
494 		reserved += free_pages_check(page + i);
495 	if (reserved)
496 		return;
497 
498 	if (!PageHighMem(page))
499 		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
500 	arch_free_page(page, order);
501 	kernel_map_pages(page, 1 << order, 0);
502 
503 	local_irq_save(flags);
504 	__count_vm_events(PGFREE, 1 << order);
505 	free_one_page(page_zone(page), page, order);
506 	local_irq_restore(flags);
507 }
508 
509 /*
510  * permit the bootmem allocator to evade page validation on high-order frees
511  */
512 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
513 {
514 	if (order == 0) {
515 		__ClearPageReserved(page);
516 		set_page_count(page, 0);
517 		set_page_refcounted(page);
518 		__free_page(page);
519 	} else {
520 		int loop;
521 
522 		prefetchw(page);
523 		for (loop = 0; loop < BITS_PER_LONG; loop++) {
524 			struct page *p = &page[loop];
525 
526 			if (loop + 1 < BITS_PER_LONG)
527 				prefetchw(p + 1);
528 			__ClearPageReserved(p);
529 			set_page_count(p, 0);
530 		}
531 
532 		set_page_refcounted(page);
533 		__free_pages(page, order);
534 	}
535 }
536 
537 
538 /*
539  * The order of subdivision here is critical for the IO subsystem.
540  * Please do not alter this order without good reasons and regression
541  * testing. Specifically, as large blocks of memory are subdivided,
542  * the order in which smaller blocks are delivered depends on the order
543  * they're subdivided in this function. This is the primary factor
544  * influencing the order in which pages are delivered to the IO
545  * subsystem according to empirical testing, and this is also justified
546  * by considering the behavior of a buddy system containing a single
547  * large block of memory acted on by a series of small allocations.
548  * This behavior is a critical factor in sglist merging's success.
549  *
550  * -- wli
551  */
552 static inline void expand(struct zone *zone, struct page *page,
553  	int low, int high, struct free_area *area)
554 {
555 	unsigned long size = 1 << high;
556 
557 	while (high > low) {
558 		area--;
559 		high--;
560 		size >>= 1;
561 		VM_BUG_ON(bad_range(zone, &page[size]));
562 		list_add(&page[size].lru, &area->free_list);
563 		area->nr_free++;
564 		set_page_order(&page[size], high);
565 	}
566 }
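/*
 * Illustrative example (added for clarity; not part of the original
 * source): satisfying an order-0 request from an order-3 block
 * (low == 0, high == 3) returns the remainder as
 *
 *	page[4..7] -> free_area[2]	(order 2)
 *	page[2..3] -> free_area[1]	(order 1)
 *	page[1]    -> free_area[0]	(order 0)
 *
 * and page[0] goes to the caller; this is the delivery order the comment
 * above is protecting.
 */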
567 
568 /*
569  * This page is about to be returned from the page allocator
570  */
571 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
572 {
573 	if (unlikely(page_mapcount(page) |
574 		(page->mapping != NULL)  |
575 		(page_count(page) != 0)  |
576 		(page->flags & (
577 			1 << PG_lru	|
578 			1 << PG_private	|
579 			1 << PG_locked	|
580 			1 << PG_active	|
581 			1 << PG_dirty	|
582 			1 << PG_reclaim	|
583 			1 << PG_slab    |
584 			1 << PG_swapcache |
585 			1 << PG_writeback |
586 			1 << PG_reserved |
587 			1 << PG_buddy ))))
588 		bad_page(page);
589 
590 	/*
591 	 * For now, we report if PG_reserved was found set, but do not
592 	 * clear it, and do not allocate the page: as a safety net.
593 	 */
594 	if (PageReserved(page))
595 		return 1;
596 
597 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
598 			1 << PG_referenced | 1 << PG_arch_1 |
599 			1 << PG_checked | 1 << PG_mappedtodisk);
600 	set_page_private(page, 0);
601 	set_page_refcounted(page);
602 
603 	arch_alloc_page(page, order);
604 	kernel_map_pages(page, 1 << order, 1);
605 
606 	if (gfp_flags & __GFP_ZERO)
607 		prep_zero_page(page, order, gfp_flags);
608 
609 	if (order && (gfp_flags & __GFP_COMP))
610 		prep_compound_page(page, order);
611 
612 	return 0;
613 }
614 
615 /*
616  * Do the hard work of removing an element from the buddy allocator.
617  * Call me with the zone->lock already held.
618  */
619 static struct page *__rmqueue(struct zone *zone, unsigned int order)
620 {
621 	struct free_area * area;
622 	unsigned int current_order;
623 	struct page *page;
624 
625 	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
626 		area = zone->free_area + current_order;
627 		if (list_empty(&area->free_list))
628 			continue;
629 
630 		page = list_entry(area->free_list.next, struct page, lru);
631 		list_del(&page->lru);
632 		rmv_page_order(page);
633 		area->nr_free--;
634 		zone->free_pages -= 1UL << order;
635 		expand(zone, page, order, current_order, area);
636 		return page;
637 	}
638 
639 	return NULL;
640 }
641 
642 /*
643  * Obtain a specified number of elements from the buddy allocator, all under
644  * a single hold of the lock, for efficiency.  Add them to the supplied list.
645  * Returns the number of new pages which were placed at *list.
646  */
647 static int rmqueue_bulk(struct zone *zone, unsigned int order,
648 			unsigned long count, struct list_head *list)
649 {
650 	int i;
651 
652 	spin_lock(&zone->lock);
653 	for (i = 0; i < count; ++i) {
654 		struct page *page = __rmqueue(zone, order);
655 		if (unlikely(page == NULL))
656 			break;
657 		list_add_tail(&page->lru, list);
658 	}
659 	spin_unlock(&zone->lock);
660 	return i;
661 }
662 
663 #ifdef CONFIG_NUMA
664 /*
665  * Called from the slab reaper to drain the currently executing processor's
666  * pagesets for the zones of a particular node.
667  * Note that this function must be called with the thread pinned to
668  * a single processor.
669  */
670 void drain_node_pages(int nodeid)
671 {
672 	int i;
673 	enum zone_type z;
674 	unsigned long flags;
675 
676 	for (z = 0; z < MAX_NR_ZONES; z++) {
677 		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
678 		struct per_cpu_pageset *pset;
679 
680 		if (!populated_zone(zone))
681 			continue;
682 
683 		pset = zone_pcp(zone, smp_processor_id());
684 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
685 			struct per_cpu_pages *pcp;
686 
687 			pcp = &pset->pcp[i];
688 			if (pcp->count) {
689 				int to_drain;
690 
691 				local_irq_save(flags);
692 				if (pcp->count >= pcp->batch)
693 					to_drain = pcp->batch;
694 				else
695 					to_drain = pcp->count;
696 				free_pages_bulk(zone, to_drain, &pcp->list, 0);
697 				pcp->count -= to_drain;
698 				local_irq_restore(flags);
699 			}
700 		}
701 	}
702 }
703 #endif
704 
705 static void __drain_pages(unsigned int cpu)
706 {
707 	unsigned long flags;
708 	struct zone *zone;
709 	int i;
710 
711 	for_each_zone(zone) {
712 		struct per_cpu_pageset *pset;
713 
714 		pset = zone_pcp(zone, cpu);
715 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
716 			struct per_cpu_pages *pcp;
717 
718 			pcp = &pset->pcp[i];
719 			local_irq_save(flags);
720 			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
721 			pcp->count = 0;
722 			local_irq_restore(flags);
723 		}
724 	}
725 }
726 
727 #ifdef CONFIG_PM
728 
729 void mark_free_pages(struct zone *zone)
730 {
731 	unsigned long pfn, max_zone_pfn;
732 	unsigned long flags;
733 	int order;
734 	struct list_head *curr;
735 
736 	if (!zone->spanned_pages)
737 		return;
738 
739 	spin_lock_irqsave(&zone->lock, flags);
740 
741 	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
742 	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
743 		if (pfn_valid(pfn)) {
744 			struct page *page = pfn_to_page(pfn);
745 
746 			if (!PageNosave(page))
747 				ClearPageNosaveFree(page);
748 		}
749 
750 	for (order = MAX_ORDER - 1; order >= 0; --order)
751 		list_for_each(curr, &zone->free_area[order].free_list) {
752 			unsigned long i;
753 
754 			pfn = page_to_pfn(list_entry(curr, struct page, lru));
755 			for (i = 0; i < (1UL << order); i++)
756 				SetPageNosaveFree(pfn_to_page(pfn + i));
757 		}
758 
759 	spin_unlock_irqrestore(&zone->lock, flags);
760 }
761 
762 /*
763  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
764  */
765 void drain_local_pages(void)
766 {
767 	unsigned long flags;
768 
769 	local_irq_save(flags);
770 	__drain_pages(smp_processor_id());
771 	local_irq_restore(flags);
772 }
773 #endif /* CONFIG_PM */
774 
775 /*
776  * Free a 0-order page
777  */
778 static void fastcall free_hot_cold_page(struct page *page, int cold)
779 {
780 	struct zone *zone = page_zone(page);
781 	struct per_cpu_pages *pcp;
782 	unsigned long flags;
783 
784 	if (PageAnon(page))
785 		page->mapping = NULL;
786 	if (free_pages_check(page))
787 		return;
788 
789 	if (!PageHighMem(page))
790 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
791 	arch_free_page(page, 0);
792 	kernel_map_pages(page, 1, 0);
793 
794 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
795 	local_irq_save(flags);
796 	__count_vm_event(PGFREE);
797 	list_add(&page->lru, &pcp->list);
798 	pcp->count++;
799 	if (pcp->count >= pcp->high) {
800 		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
801 		pcp->count -= pcp->batch;
802 	}
803 	local_irq_restore(flags);
804 	put_cpu();
805 }
806 
807 void fastcall free_hot_page(struct page *page)
808 {
809 	free_hot_cold_page(page, 0);
810 }
811 
812 void fastcall free_cold_page(struct page *page)
813 {
814 	free_hot_cold_page(page, 1);
815 }
816 
817 /*
818  * split_page takes a non-compound higher-order page, and splits it into
819  * n (1<<order) sub-pages: page[0..n-1]
820  * Each sub-page must be freed individually.
821  *
822  * Note: this is probably too low level an operation for use in drivers.
823  * Please consult with lkml before using this in your driver.
824  */
825 void split_page(struct page *page, unsigned int order)
826 {
827 	int i;
828 
829 	VM_BUG_ON(PageCompound(page));
830 	VM_BUG_ON(!page_count(page));
831 	for (i = 1; i < (1 << order); i++)
832 		set_page_refcounted(page + i);
833 }
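/*
 * Usage sketch (added for clarity; illustrative only, not from the
 * original source):
 *
 *	page = alloc_pages(GFP_KERNEL, 2);
 *	split_page(page, 2);
 *	...
 *	__free_page(page + 3);	each of the four sub-pages now carries its
 *				own reference count and is freed on its own
 *
 * As noted above, consult lkml before using this from a driver.
 */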
834 
835 /*
836  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
837  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
838  * or two.
839  */
840 static struct page *buffered_rmqueue(struct zonelist *zonelist,
841 			struct zone *zone, int order, gfp_t gfp_flags)
842 {
843 	unsigned long flags;
844 	struct page *page;
845 	int cold = !!(gfp_flags & __GFP_COLD);
846 	int cpu;
847 
848 again:
849 	cpu  = get_cpu();
850 	if (likely(order == 0)) {
851 		struct per_cpu_pages *pcp;
852 
853 		pcp = &zone_pcp(zone, cpu)->pcp[cold];
854 		local_irq_save(flags);
855 		if (!pcp->count) {
856 			pcp->count = rmqueue_bulk(zone, 0,
857 						pcp->batch, &pcp->list);
858 			if (unlikely(!pcp->count))
859 				goto failed;
860 		}
861 		page = list_entry(pcp->list.next, struct page, lru);
862 		list_del(&page->lru);
863 		pcp->count--;
864 	} else {
865 		spin_lock_irqsave(&zone->lock, flags);
866 		page = __rmqueue(zone, order);
867 		spin_unlock(&zone->lock);
868 		if (!page)
869 			goto failed;
870 	}
871 
872 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
873 	zone_statistics(zonelist, zone);
874 	local_irq_restore(flags);
875 	put_cpu();
876 
877 	VM_BUG_ON(bad_range(zone, page));
878 	if (prep_new_page(page, order, gfp_flags))
879 		goto again;
880 	return page;
881 
882 failed:
883 	local_irq_restore(flags);
884 	put_cpu();
885 	return NULL;
886 }
887 
888 #define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
889 #define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
890 #define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
891 #define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
892 #define ALLOC_HARDER		0x10 /* try to alloc harder */
893 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
894 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
895 
896 #ifdef CONFIG_FAIL_PAGE_ALLOC
897 
898 static struct fail_page_alloc_attr {
899 	struct fault_attr attr;
900 
901 	u32 ignore_gfp_highmem;
902 	u32 ignore_gfp_wait;
903 
904 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
905 
906 	struct dentry *ignore_gfp_highmem_file;
907 	struct dentry *ignore_gfp_wait_file;
908 
909 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
910 
911 } fail_page_alloc = {
912 	.attr = FAULT_ATTR_INITIALIZER,
913 	.ignore_gfp_wait = 1,
914 	.ignore_gfp_highmem = 1,
915 };
916 
917 static int __init setup_fail_page_alloc(char *str)
918 {
919 	return setup_fault_attr(&fail_page_alloc.attr, str);
920 }
921 __setup("fail_page_alloc=", setup_fail_page_alloc);
922 
923 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
924 {
925 	if (gfp_mask & __GFP_NOFAIL)
926 		return 0;
927 	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
928 		return 0;
929 	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
930 		return 0;
931 
932 	return should_fail(&fail_page_alloc.attr, 1 << order);
933 }
934 
935 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
936 
937 static int __init fail_page_alloc_debugfs(void)
938 {
939 	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
940 	struct dentry *dir;
941 	int err;
942 
943 	err = init_fault_attr_dentries(&fail_page_alloc.attr,
944 				       "fail_page_alloc");
945 	if (err)
946 		return err;
947 	dir = fail_page_alloc.attr.dentries.dir;
948 
949 	fail_page_alloc.ignore_gfp_wait_file =
950 		debugfs_create_bool("ignore-gfp-wait", mode, dir,
951 				      &fail_page_alloc.ignore_gfp_wait);
952 
953 	fail_page_alloc.ignore_gfp_highmem_file =
954 		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
955 				      &fail_page_alloc.ignore_gfp_highmem);
956 
957 	if (!fail_page_alloc.ignore_gfp_wait_file ||
958 			!fail_page_alloc.ignore_gfp_highmem_file) {
959 		err = -ENOMEM;
960 		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
961 		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
962 		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
963 	}
964 
965 	return err;
966 }
967 
968 late_initcall(fail_page_alloc_debugfs);
969 
970 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
971 
972 #else /* CONFIG_FAIL_PAGE_ALLOC */
973 
974 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
975 {
976 	return 0;
977 }
978 
979 #endif /* CONFIG_FAIL_PAGE_ALLOC */
980 
981 /*
982  * Return 1 if free pages are above 'mark'. This takes into account the order
983  * of the allocation.
984  */
985 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
986 		      int classzone_idx, int alloc_flags)
987 {
988 	/* free_pages may go negative - that's OK */
989 	unsigned long min = mark;
990 	long free_pages = z->free_pages - (1 << order) + 1;
991 	int o;
992 
993 	if (alloc_flags & ALLOC_HIGH)
994 		min -= min / 2;
995 	if (alloc_flags & ALLOC_HARDER)
996 		min -= min / 4;
997 
998 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
999 		return 0;
1000 	for (o = 0; o < order; o++) {
1001 		/* At the next order, this order's pages become unavailable */
1002 		free_pages -= z->free_area[o].nr_free << o;
1003 
1004 		/* Require fewer higher order pages to be free */
1005 		min >>= 1;
1006 
1007 		if (free_pages <= min)
1008 			return 0;
1009 	}
1010 	return 1;
1011 }
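/*
 * Worked example (added for clarity; illustrative numbers only): an
 * order-2 GFP_ATOMIC request (ALLOC_HIGH | ALLOC_HARDER) against a zone
 * with pages_min == 1024, no lowmem_reserve, and 2000 free pages of
 * which 1500 are order-0:
 *
 *	min = 1024 - 512 - 128 = 384
 *	free_pages = 2000 - 4 + 1 = 1997 > 384			-> pass
 *	o = 0: free_pages -= 1500 -> 497, min >>= 1 -> 192	-> pass
 *	o = 1: order-1 pages are subtracted, min -> 96, ...
 *
 * The request only succeeds if enough memory remains in blocks of at
 * least the requested order at every step.
 */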
1012 
1013 #ifdef CONFIG_NUMA
1014 /*
1015  * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
1016  * skip over zones that are not allowed by the cpuset, or that have
1017  * been recently (in the last second) found to be nearly full.  See further
1018  * comments in mmzone.h.  Reduces cache footprint of zonelist scans
1019  * that have to skip over a lot of full or unallowed zones.
1020  *
1021  * If the zonelist cache is present in the passed in zonelist, then
1022  * returns a pointer to the allowed node mask (either the current
1023  * task's mems_allowed, or node_online_map.)
1024  *
1025  * If the zonelist cache is not available for this zonelist, does
1026  * nothing and returns NULL.
1027  *
1028  * If the fullzones BITMAP in the zonelist cache is stale (more than
1029  * a second since last zap'd) then we zap it out (clear its bits.)
1030  *
1031  * We hold off even calling zlc_setup, until after we've checked the
1032  * first zone in the zonelist, on the theory that most allocations will
1033  * be satisfied from that first zone, so best to examine that zone as
1034  * quickly as we can.
1035  */
1036 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1037 {
1038 	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1039 	nodemask_t *allowednodes;	/* zonelist_cache approximation */
1040 
1041 	zlc = zonelist->zlcache_ptr;
1042 	if (!zlc)
1043 		return NULL;
1044 
1045 	if (jiffies - zlc->last_full_zap > 1 * HZ) {
1046 		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1047 		zlc->last_full_zap = jiffies;
1048 	}
1049 
1050 	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1051 					&cpuset_current_mems_allowed :
1052 					&node_online_map;
1053 	return allowednodes;
1054 }
1055 
1056 /*
1057  * Given 'z' scanning a zonelist, run a couple of quick checks to see
1058  * if it is worth looking at further for free memory:
1059  *  1) Check that the zone isn't thought to be full (doesn't have its
1060  *     bit set in the zonelist_cache fullzones BITMAP).
1061  *  2) Check that the zone's node (obtained from the zonelist_cache
1062  *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1063  * Return true (non-zero) if zone is worth looking at further, or
1064  * else return false (zero) if it is not.
1065  *
1066  * This check -ignores- the distinction between various watermarks,
1067  * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
1068  * found to be full for any variation of these watermarks, it will
1069  * be considered full for up to one second by all requests, unless
1070  * we are so low on memory on all allowed nodes that we are forced
1071  * into the second scan of the zonelist.
1072  *
1073  * In the second scan we ignore this zonelist cache and exactly
1074  * apply the watermarks to all zones, even if it is slower to do so.
1075  * We are low on memory in the second scan, and should leave no stone
1076  * unturned looking for a free page.
1077  */
1078 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1079 						nodemask_t *allowednodes)
1080 {
1081 	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1082 	int i;				/* index of *z in zonelist zones */
1083 	int n;				/* node that zone *z is on */
1084 
1085 	zlc = zonelist->zlcache_ptr;
1086 	if (!zlc)
1087 		return 1;
1088 
1089 	i = z - zonelist->zones;
1090 	n = zlc->z_to_n[i];
1091 
1092 	/* This zone is worth trying if it is allowed but not full */
1093 	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1094 }
1095 
1096 /*
1097  * Given 'z' scanning a zonelist, set the corresponding bit in
1098  * zlc->fullzones, so that subsequent attempts to allocate a page
1099  * from that zone don't waste time re-examining it.
1100  */
1101 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1102 {
1103 	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
1104 	int i;				/* index of *z in zonelist zones */
1105 
1106 	zlc = zonelist->zlcache_ptr;
1107 	if (!zlc)
1108 		return;
1109 
1110 	i = z - zonelist->zones;
1111 
1112 	set_bit(i, zlc->fullzones);
1113 }
1114 
1115 #else	/* CONFIG_NUMA */
1116 
1117 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1118 {
1119 	return NULL;
1120 }
1121 
1122 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1123 				nodemask_t *allowednodes)
1124 {
1125 	return 1;
1126 }
1127 
1128 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1129 {
1130 }
1131 #endif	/* CONFIG_NUMA */
1132 
1133 /*
1134  * get_page_from_freelist goes through the zonelist trying to allocate
1135  * a page.
1136  */
1137 static struct page *
1138 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
1139 		struct zonelist *zonelist, int alloc_flags)
1140 {
1141 	struct zone **z;
1142 	struct page *page = NULL;
1143 	int classzone_idx = zone_idx(zonelist->zones[0]);
1144 	struct zone *zone;
1145 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1146 	int zlc_active = 0;		/* set if using zonelist_cache */
1147 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
1148 
1149 zonelist_scan:
1150 	/*
1151 	 * Scan zonelist, looking for a zone with enough free.
1152 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1153 	 */
1154 	z = zonelist->zones;
1155 
1156 	do {
1157 		if (NUMA_BUILD && zlc_active &&
1158 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
1159 				continue;
1160 		zone = *z;
1161 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
1162 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
1163 				break;
1164 		if ((alloc_flags & ALLOC_CPUSET) &&
1165 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
1166 				goto try_next_zone;
1167 
1168 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1169 			unsigned long mark;
1170 			if (alloc_flags & ALLOC_WMARK_MIN)
1171 				mark = zone->pages_min;
1172 			else if (alloc_flags & ALLOC_WMARK_LOW)
1173 				mark = zone->pages_low;
1174 			else
1175 				mark = zone->pages_high;
1176 			if (!zone_watermark_ok(zone, order, mark,
1177 				    classzone_idx, alloc_flags)) {
1178 				if (!zone_reclaim_mode ||
1179 				    !zone_reclaim(zone, gfp_mask, order))
1180 					goto this_zone_full;
1181 			}
1182 		}
1183 
1184 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
1185 		if (page)
1186 			break;
1187 this_zone_full:
1188 		if (NUMA_BUILD)
1189 			zlc_mark_zone_full(zonelist, z);
1190 try_next_zone:
1191 		if (NUMA_BUILD && !did_zlc_setup) {
1192 			/* we do zlc_setup after the first zone is tried */
1193 			allowednodes = zlc_setup(zonelist, alloc_flags);
1194 			zlc_active = 1;
1195 			did_zlc_setup = 1;
1196 		}
1197 	} while (*(++z) != NULL);
1198 
1199 	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1200 		/* Disable zlc cache for second zonelist scan */
1201 		zlc_active = 0;
1202 		goto zonelist_scan;
1203 	}
1204 	return page;
1205 }
1206 
1207 /*
1208  * This is the 'heart' of the zoned buddy allocator.
1209  */
1210 struct page * fastcall
1211 __alloc_pages(gfp_t gfp_mask, unsigned int order,
1212 		struct zonelist *zonelist)
1213 {
1214 	const gfp_t wait = gfp_mask & __GFP_WAIT;
1215 	struct zone **z;
1216 	struct page *page;
1217 	struct reclaim_state reclaim_state;
1218 	struct task_struct *p = current;
1219 	int do_retry;
1220 	int alloc_flags;
1221 	int did_some_progress;
1222 
1223 	might_sleep_if(wait);
1224 
1225 	if (should_fail_alloc_page(gfp_mask, order))
1226 		return NULL;
1227 
1228 restart:
1229 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
1230 
1231 	if (unlikely(*z == NULL)) {
1232 		/* Should this ever happen?? */
1233 		return NULL;
1234 	}
1235 
1236 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1237 				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1238 	if (page)
1239 		goto got_pg;
1240 
1241 	/*
1242 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1243 	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1244 	 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim
1245 	 * using a larger set of nodes after it has established that the
1246 	 * allowed per node queues are empty and that nodes are
1247 	 * over allocated.
1248 	 */
1249 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1250 		goto nopage;
1251 
1252 	for (z = zonelist->zones; *z; z++)
1253 		wakeup_kswapd(*z, order);
1254 
1255 	/*
1256 	 * OK, we're below the kswapd watermark and have kicked background
1257 	 * reclaim. Now things get more complex, so set up alloc_flags according
1258 	 * to how we want to proceed.
1259 	 *
1260 	 * The caller may dip into page reserves a bit more if the caller
1261 	 * cannot run direct reclaim, or if the caller has realtime scheduling
1262 	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
1263 	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1264 	 */
1265 	alloc_flags = ALLOC_WMARK_MIN;
1266 	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1267 		alloc_flags |= ALLOC_HARDER;
1268 	if (gfp_mask & __GFP_HIGH)
1269 		alloc_flags |= ALLOC_HIGH;
1270 	if (wait)
1271 		alloc_flags |= ALLOC_CPUSET;
1272 
1273 	/*
1274 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
1275 	 * coming from realtime tasks go deeper into reserves.
1276 	 *
1277 	 * This is the last chance, in general, before the goto nopage.
1278 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1279 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1280 	 */
1281 	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
1282 	if (page)
1283 		goto got_pg;
1284 
1285 	/* This allocation should allow future memory freeing. */
1286 
1287 rebalance:
1288 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1289 			&& !in_interrupt()) {
1290 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1291 nofail_alloc:
1292 			/* go through the zonelist yet again, ignoring mins */
1293 			page = get_page_from_freelist(gfp_mask, order,
1294 				zonelist, ALLOC_NO_WATERMARKS);
1295 			if (page)
1296 				goto got_pg;
1297 			if (gfp_mask & __GFP_NOFAIL) {
1298 				congestion_wait(WRITE, HZ/50);
1299 				goto nofail_alloc;
1300 			}
1301 		}
1302 		goto nopage;
1303 	}
1304 
1305 	/* Atomic allocations - we can't balance anything */
1306 	if (!wait)
1307 		goto nopage;
1308 
1309 	cond_resched();
1310 
1311 	/* We now go into synchronous reclaim */
1312 	cpuset_memory_pressure_bump();
1313 	p->flags |= PF_MEMALLOC;
1314 	reclaim_state.reclaimed_slab = 0;
1315 	p->reclaim_state = &reclaim_state;
1316 
1317 	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
1318 
1319 	p->reclaim_state = NULL;
1320 	p->flags &= ~PF_MEMALLOC;
1321 
1322 	cond_resched();
1323 
1324 	if (likely(did_some_progress)) {
1325 		page = get_page_from_freelist(gfp_mask, order,
1326 						zonelist, alloc_flags);
1327 		if (page)
1328 			goto got_pg;
1329 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1330 		/*
1331 		 * Go through the zonelist yet one more time, keep
1332 		 * very high watermark here, this is only to catch
1333 		 * a parallel oom killing, we must fail if we're still
1334 		 * under heavy pressure.
1335 		 */
1336 		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1337 				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1338 		if (page)
1339 			goto got_pg;
1340 
1341 		out_of_memory(zonelist, gfp_mask, order);
1342 		goto restart;
1343 	}
1344 
1345 	/*
1346 	 * Don't let big-order allocations loop unless the caller explicitly
1347 	 * requests that.  Wait for some write requests to complete then retry.
1348 	 *
1349 	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
1350 	 * <= 3, but that may not be true in other implementations.
1351 	 */
1352 	do_retry = 0;
1353 	if (!(gfp_mask & __GFP_NORETRY)) {
1354 		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
1355 			do_retry = 1;
1356 		if (gfp_mask & __GFP_NOFAIL)
1357 			do_retry = 1;
1358 	}
1359 	if (do_retry) {
1360 		congestion_wait(WRITE, HZ/50);
1361 		goto rebalance;
1362 	}
1363 
1364 nopage:
1365 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1366 		printk(KERN_WARNING "%s: page allocation failure."
1367 			" order:%d, mode:0x%x\n",
1368 			p->comm, order, gfp_mask);
1369 		dump_stack();
1370 		show_mem();
1371 	}
1372 got_pg:
1373 	return page;
1374 }
1375 
1376 EXPORT_SYMBOL(__alloc_pages);
1377 
1378 /*
1379  * Common helper functions.
1380  */
1381 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1382 {
1383 	struct page * page;
1384 	page = alloc_pages(gfp_mask, order);
1385 	if (!page)
1386 		return 0;
1387 	return (unsigned long) page_address(page);
1388 }
1389 
1390 EXPORT_SYMBOL(__get_free_pages);
1391 
1392 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1393 {
1394 	struct page * page;
1395 
1396 	/*
1397 	 * get_zeroed_page() returns a 32-bit address, which cannot represent
1398 	 * a highmem page
1399 	 */
1400 	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1401 
1402 	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1403 	if (page)
1404 		return (unsigned long) page_address(page);
1405 	return 0;
1406 }
1407 
1408 EXPORT_SYMBOL(get_zeroed_page);
1409 
1410 void __pagevec_free(struct pagevec *pvec)
1411 {
1412 	int i = pagevec_count(pvec);
1413 
1414 	while (--i >= 0)
1415 		free_hot_cold_page(pvec->pages[i], pvec->cold);
1416 }
1417 
1418 fastcall void __free_pages(struct page *page, unsigned int order)
1419 {
1420 	if (put_page_testzero(page)) {
1421 		if (order == 0)
1422 			free_hot_page(page);
1423 		else
1424 			__free_pages_ok(page, order);
1425 	}
1426 }
1427 
1428 EXPORT_SYMBOL(__free_pages);
1429 
1430 fastcall void free_pages(unsigned long addr, unsigned int order)
1431 {
1432 	if (addr != 0) {
1433 		VM_BUG_ON(!virt_addr_valid((void *)addr));
1434 		__free_pages(virt_to_page((void *)addr), order);
1435 	}
1436 }
1437 
1438 EXPORT_SYMBOL(free_pages);
1439 
1440 /*
1441  * Total amount of free (allocatable) RAM:
1442  */
1443 unsigned int nr_free_pages(void)
1444 {
1445 	unsigned int sum = 0;
1446 	struct zone *zone;
1447 
1448 	for_each_zone(zone)
1449 		sum += zone->free_pages;
1450 
1451 	return sum;
1452 }
1453 
1454 EXPORT_SYMBOL(nr_free_pages);
1455 
1456 #ifdef CONFIG_NUMA
1457 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1458 {
1459 	unsigned int sum = 0;
1460 	enum zone_type i;
1461 
1462 	for (i = 0; i < MAX_NR_ZONES; i++)
1463 		sum += pgdat->node_zones[i].free_pages;
1464 
1465 	return sum;
1466 }
1467 #endif
1468 
1469 static unsigned int nr_free_zone_pages(int offset)
1470 {
1471 	/* Just pick one node, since fallback list is circular */
1472 	pg_data_t *pgdat = NODE_DATA(numa_node_id());
1473 	unsigned int sum = 0;
1474 
1475 	struct zonelist *zonelist = pgdat->node_zonelists + offset;
1476 	struct zone **zonep = zonelist->zones;
1477 	struct zone *zone;
1478 
1479 	for (zone = *zonep++; zone; zone = *zonep++) {
1480 		unsigned long size = zone->present_pages;
1481 		unsigned long high = zone->pages_high;
1482 		if (size > high)
1483 			sum += size - high;
1484 	}
1485 
1486 	return sum;
1487 }
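/*
 * Example (added for clarity; illustrative only): a zone with 262144
 * present pages and pages_high == 1024 contributes 261120 pages to the
 * sum; a zone at or below its high watermark contributes nothing.
 */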
1488 
1489 /*
1490  * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1491  */
1492 unsigned int nr_free_buffer_pages(void)
1493 {
1494 	return nr_free_zone_pages(gfp_zone(GFP_USER));
1495 }
1496 
1497 /*
1498  * Amount of free RAM allocatable within all zones
1499  */
1500 unsigned int nr_free_pagecache_pages(void)
1501 {
1502 	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1503 }
1504 
1505 static inline void show_node(struct zone *zone)
1506 {
1507 	if (NUMA_BUILD)
1508 		printk("Node %d ", zone_to_nid(zone));
1509 }
1510 
1511 void si_meminfo(struct sysinfo *val)
1512 {
1513 	val->totalram = totalram_pages;
1514 	val->sharedram = 0;
1515 	val->freeram = nr_free_pages();
1516 	val->bufferram = nr_blockdev_pages();
1517 	val->totalhigh = totalhigh_pages;
1518 	val->freehigh = nr_free_highpages();
1519 	val->mem_unit = PAGE_SIZE;
1520 }
1521 
1522 EXPORT_SYMBOL(si_meminfo);
1523 
1524 #ifdef CONFIG_NUMA
1525 void si_meminfo_node(struct sysinfo *val, int nid)
1526 {
1527 	pg_data_t *pgdat = NODE_DATA(nid);
1528 
1529 	val->totalram = pgdat->node_present_pages;
1530 	val->freeram = nr_free_pages_pgdat(pgdat);
1531 #ifdef CONFIG_HIGHMEM
1532 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1533 	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1534 #else
1535 	val->totalhigh = 0;
1536 	val->freehigh = 0;
1537 #endif
1538 	val->mem_unit = PAGE_SIZE;
1539 }
1540 #endif
1541 
1542 #define K(x) ((x) << (PAGE_SHIFT-10))
1543 
1544 /*
1545  * Show free area list (used inside shift_scroll-lock stuff)
1546  * We also calculate the percentage fragmentation. We do this by counting the
1547  * memory on each free list with the exception of the first item on the list.
1548  */
1549 void show_free_areas(void)
1550 {
1551 	int cpu;
1552 	unsigned long active;
1553 	unsigned long inactive;
1554 	unsigned long free;
1555 	struct zone *zone;
1556 
1557 	for_each_zone(zone) {
1558 		if (!populated_zone(zone))
1559 			continue;
1560 
1561 		show_node(zone);
1562 		printk("%s per-cpu:\n", zone->name);
1563 
1564 		for_each_online_cpu(cpu) {
1565 			struct per_cpu_pageset *pageset;
1566 
1567 			pageset = zone_pcp(zone, cpu);
1568 
1569 			printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d   "
1570 			       "Cold: hi:%5d, btch:%4d usd:%4d\n",
1571 			       cpu, pageset->pcp[0].high,
1572 			       pageset->pcp[0].batch, pageset->pcp[0].count,
1573 			       pageset->pcp[1].high, pageset->pcp[1].batch,
1574 			       pageset->pcp[1].count);
1575 		}
1576 	}
1577 
1578 	get_zone_counts(&active, &inactive, &free);
1579 
1580 	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1581 		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1582 		active,
1583 		inactive,
1584 		global_page_state(NR_FILE_DIRTY),
1585 		global_page_state(NR_WRITEBACK),
1586 		global_page_state(NR_UNSTABLE_NFS),
1587 		nr_free_pages(),
1588 		global_page_state(NR_SLAB_RECLAIMABLE) +
1589 			global_page_state(NR_SLAB_UNRECLAIMABLE),
1590 		global_page_state(NR_FILE_MAPPED),
1591 		global_page_state(NR_PAGETABLE));
1592 
1593 	for_each_zone(zone) {
1594 		int i;
1595 
1596 		if (!populated_zone(zone))
1597 			continue;
1598 
1599 		show_node(zone);
1600 		printk("%s"
1601 			" free:%lukB"
1602 			" min:%lukB"
1603 			" low:%lukB"
1604 			" high:%lukB"
1605 			" active:%lukB"
1606 			" inactive:%lukB"
1607 			" present:%lukB"
1608 			" pages_scanned:%lu"
1609 			" all_unreclaimable? %s"
1610 			"\n",
1611 			zone->name,
1612 			K(zone->free_pages),
1613 			K(zone->pages_min),
1614 			K(zone->pages_low),
1615 			K(zone->pages_high),
1616 			K(zone->nr_active),
1617 			K(zone->nr_inactive),
1618 			K(zone->present_pages),
1619 			zone->pages_scanned,
1620 			(zone->all_unreclaimable ? "yes" : "no")
1621 			);
1622 		printk("lowmem_reserve[]:");
1623 		for (i = 0; i < MAX_NR_ZONES; i++)
1624 			printk(" %lu", zone->lowmem_reserve[i]);
1625 		printk("\n");
1626 	}
1627 
1628 	for_each_zone(zone) {
1629  		unsigned long nr[MAX_ORDER], flags, order, total = 0;
1630 
1631 		if (!populated_zone(zone))
1632 			continue;
1633 
1634 		show_node(zone);
1635 		printk("%s: ", zone->name);
1636 
1637 		spin_lock_irqsave(&zone->lock, flags);
1638 		for (order = 0; order < MAX_ORDER; order++) {
1639 			nr[order] = zone->free_area[order].nr_free;
1640 			total += nr[order] << order;
1641 		}
1642 		spin_unlock_irqrestore(&zone->lock, flags);
1643 		for (order = 0; order < MAX_ORDER; order++)
1644 			printk("%lu*%lukB ", nr[order], K(1UL) << order);
1645 		printk("= %lukB\n", K(total));
1646 	}
1647 
1648 	show_swap_cache_info();
1649 }
1650 
1651 /*
1652  * Builds allocation fallback zone lists.
1653  *
1654  * Add all populated zones of a node to the zonelist.
1655  */
1656 static int __meminit build_zonelists_node(pg_data_t *pgdat,
1657 			struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1658 {
1659 	struct zone *zone;
1660 
1661 	BUG_ON(zone_type >= MAX_NR_ZONES);
1662 	zone_type++;
1663 
1664 	do {
1665 		zone_type--;
1666 		zone = pgdat->node_zones + zone_type;
1667 		if (populated_zone(zone)) {
1668 			zonelist->zones[nr_zones++] = zone;
1669 			check_highest_zone(zone_type);
1670 		}
1671 
1672 	} while (zone_type);
1673 	return nr_zones;
1674 }
1675 
1676 #ifdef CONFIG_NUMA
1677 #define MAX_NODE_LOAD (num_online_nodes())
1678 static int __meminitdata node_load[MAX_NUMNODES];
1679 /**
1680  * find_next_best_node - find the next node that should appear in a given node's fallback list
1681  * @node: node whose fallback list we're appending
1682  * @used_node_mask: nodemask_t of already used nodes
1683  *
1684  * We use a number of factors to determine which is the next node that should
1685  * appear on a given node's fallback list.  The node should not have appeared
1686  * already in @node's fallback list, and it should be the next closest node
1687  * according to the distance array (which contains arbitrary distance values
1688  * from each node to each node in the system), and should also prefer nodes
1689  * from each node to each node in the system); we also prefer nodes
1690  * on them otherwise.
1691  * It returns -1 if no node is found.
1692  */
1693 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1694 {
1695 	int n, val;
1696 	int min_val = INT_MAX;
1697 	int best_node = -1;
1698 
1699 	/* Use the local node if we haven't already */
1700 	if (!node_isset(node, *used_node_mask)) {
1701 		node_set(node, *used_node_mask);
1702 		return node;
1703 	}
1704 
1705 	for_each_online_node(n) {
1706 		cpumask_t tmp;
1707 
1708 		/* Don't want a node to appear more than once */
1709 		if (node_isset(n, *used_node_mask))
1710 			continue;
1711 
1712 		/* Use the distance array to find the distance */
1713 		val = node_distance(node, n);
1714 
1715 		/* Penalize nodes under us ("prefer the next node") */
1716 		val += (n < node);
1717 
1718 		/* Give preference to headless and unused nodes */
1719 		tmp = node_to_cpumask(n);
1720 		if (!cpus_empty(tmp))
1721 			val += PENALTY_FOR_NODE_WITH_CPUS;
1722 
1723 		/* Slight preference for less loaded node */
1724 		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1725 		val += node_load[n];
1726 
1727 		if (val < min_val) {
1728 			min_val = val;
1729 			best_node = n;
1730 		}
1731 	}
1732 
1733 	if (best_node >= 0)
1734 		node_set(best_node, *used_node_mask);
1735 
1736 	return best_node;
1737 }
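/*
 * Worked example (added for clarity; assumes the default
 * PENALTY_FOR_NODE_WITH_CPUS of 1): seen from node 0, a candidate node
 * with CPUs at distance 20 scores
 * (20 + 1) * MAX_NODE_LOAD * MAX_NUMNODES + node_load[n], while a
 * memory-only node at the same distance scores
 * 20 * MAX_NODE_LOAD * MAX_NUMNODES + node_load[n]; with equal node_load
 * the headless node is picked first.
 */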
1738 
1739 static void __meminit build_zonelists(pg_data_t *pgdat)
1740 {
1741 	int j, node, local_node;
1742 	enum zone_type i;
1743 	int prev_node, load;
1744 	struct zonelist *zonelist;
1745 	nodemask_t used_mask;
1746 
1747 	/* initialize zonelists */
1748 	for (i = 0; i < MAX_NR_ZONES; i++) {
1749 		zonelist = pgdat->node_zonelists + i;
1750 		zonelist->zones[0] = NULL;
1751 	}
1752 
1753 	/* NUMA-aware ordering of nodes */
1754 	local_node = pgdat->node_id;
1755 	load = num_online_nodes();
1756 	prev_node = local_node;
1757 	nodes_clear(used_mask);
1758 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1759 		int distance = node_distance(local_node, node);
1760 
1761 		/*
1762 		 * If another node is sufficiently far away then it is better
1763 		 * to reclaim pages in a zone before going off node.
1764 		 */
1765 		if (distance > RECLAIM_DISTANCE)
1766 			zone_reclaim_mode = 1;
1767 
1768 		/*
1769 		 * We don't want to pressure a particular node.
1770 		 * So we add a penalty to the first node in the same
1771 		 * distance group to make the selection round-robin.
1772 		 */
1773 
1774 		if (distance != node_distance(local_node, prev_node))
1775 			node_load[node] += load;
1776 		prev_node = node;
1777 		load--;
1778 		for (i = 0; i < MAX_NR_ZONES; i++) {
1779 			zonelist = pgdat->node_zonelists + i;
1780 			for (j = 0; zonelist->zones[j] != NULL; j++);
1781 
1782 	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1783 			zonelist->zones[j] = NULL;
1784 		}
1785 	}
1786 }
1787 
1788 /* Construct the zonelist performance cache - see further mmzone.h */
1789 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1790 {
1791 	int i;
1792 
1793 	for (i = 0; i < MAX_NR_ZONES; i++) {
1794 		struct zonelist *zonelist;
1795 		struct zonelist_cache *zlc;
1796 		struct zone **z;
1797 
1798 		zonelist = pgdat->node_zonelists + i;
1799 		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1800 		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1801 		for (z = zonelist->zones; *z; z++)
1802 			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1803 	}
1804 }
1805 
1806 #else	/* CONFIG_NUMA */
1807 
1808 static void __meminit build_zonelists(pg_data_t *pgdat)
1809 {
1810 	int node, local_node;
1811 	enum zone_type i,j;
1812 
1813 	local_node = pgdat->node_id;
1814 	for (i = 0; i < MAX_NR_ZONES; i++) {
1815 		struct zonelist *zonelist;
1816 
1817 		zonelist = pgdat->node_zonelists + i;
1818 
1819  		j = build_zonelists_node(pgdat, zonelist, 0, i);
1820  		/*
1821  		 * Now we build the zonelist so that it contains the zones
1822  		 * of all the other nodes.
1823  		 * We don't want to pressure a particular node, so when
1824  		 * building the zones for node N, we make sure that the
1825  		 * zones coming right after the local ones are those from
1826  		 * node N+1 (modulo N)
1827  		 */
1828 		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1829 			if (!node_online(node))
1830 				continue;
1831 			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1832 		}
1833 		for (node = 0; node < local_node; node++) {
1834 			if (!node_online(node))
1835 				continue;
1836 			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1837 		}
1838 
1839 		zonelist->zones[j] = NULL;
1840 	}
1841 }
1842 
1843 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1844 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1845 {
1846 	int i;
1847 
1848 	for (i = 0; i < MAX_NR_ZONES; i++)
1849 		pgdat->node_zonelists[i].zlcache_ptr = NULL;
1850 }
1851 
1852 #endif	/* CONFIG_NUMA */
1853 
1854 /* The return type is int only to match the stop_machine_run() callback prototype */
1855 static int __meminit __build_all_zonelists(void *dummy)
1856 {
1857 	int nid;
1858 
1859 	for_each_online_node(nid) {
1860 		build_zonelists(NODE_DATA(nid));
1861 		build_zonelist_cache(NODE_DATA(nid));
1862 	}
1863 	return 0;
1864 }
1865 
1866 void __meminit build_all_zonelists(void)
1867 {
1868 	if (system_state == SYSTEM_BOOTING) {
1869 		__build_all_zonelists(NULL);
1870 		cpuset_init_current_mems_allowed();
1871 	} else {
1872 		/* we have to stop all cpus to guarantee there is no user
1873 		   of zonelist */
1874 		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1875 		/* cpuset refresh routine should be here */
1876 	}
1877 	vm_total_pages = nr_free_pagecache_pages();
1878 	printk("Built %i zonelists.  Total pages: %ld\n",
1879 			num_online_nodes(), vm_total_pages);
1880 }
1881 
1882 /*
1883  * Helper functions to size the waitqueue hash table.
1884  * Essentially these want to choose hash table sizes sufficiently
1885  * large so that collisions trying to wait on pages are rare.
1886  * But in fact, the number of active page waitqueues on typical
1887  * systems is ridiculously low, less than 200, so this ratio is
1888  * still conservative even though the resulting table seems large.
1889  *
1890  * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1891  * waitqueues, i.e. the size of the waitq table given the number of pages.
1892  */
1893 #define PAGES_PER_WAITQUEUE	256
1894 
1895 #ifndef CONFIG_MEMORY_HOTPLUG
1896 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1897 {
1898 	unsigned long size = 1;
1899 
1900 	pages /= PAGES_PER_WAITQUEUE;
1901 
1902 	while (size < pages)
1903 		size <<= 1;
1904 
1905 	/*
1906 	 * Once we have dozens or even hundreds of threads sleeping
1907 	 * on IO we've got bigger problems than wait queue collision.
1908 	 * Limit the size of the wait table to a reasonable size.
1909 	 */
1910 	size = min(size, 4096UL);
1911 
1912 	return max(size, 4UL);
1913 }
1914 #else
1915 /*
1916  * A zone's size might be changed by hot-add, so it is not possible to determine
1917  * a suitable size for its wait_table.  So we use the maximum size now.
1918  *
1919  * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
1920  *
1921  *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
1922  *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1923  *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
1924  *
1925  * By the formula above, the maximum number of entries is reached once a
1926  * zone's memory is (512K + 256) pages or more, which corresponds to:
1927  *
1928  *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
1929  *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
1930  *    powerpc (64K page size)             : =  (32G +16M)byte.
1931  */
1932 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1933 {
1934 	return 4096UL;
1935 }
1936 #endif
1937 
1938 /*
1939  * This is an integer logarithm so that shifts can be used later
1940  * to extract the more random high bits from the multiplicative
1941  * hash function before the remainder is taken.
1942  */
1943 static inline unsigned long wait_table_bits(unsigned long size)
1944 {
1945 	return ffz(~size);
1946 }
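
/*
 * Worked sizing example (illustrative, assuming 4K pages and no memory
 * hotplug): a 1GB zone has 262144 pages, so wait_table_hash_nr_entries()
 * returns the next power of two >= 262144 / 256 = 1024, i.e. 1024
 * entries, and wait_table_bits(1024) = ffz(~1024) = 10 hash bits.
 */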
1947 
1948 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1949 
1950 /*
1951  * Initially all pages are reserved - free ones are freed
1952  * up by free_all_bootmem() once the early boot process is
1953  * done. Non-atomic initialization, single-pass.
1954  */
1955 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1956 		unsigned long start_pfn)
1957 {
1958 	struct page *page;
1959 	unsigned long end_pfn = start_pfn + size;
1960 	unsigned long pfn;
1961 
1962 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1963 		if (!early_pfn_valid(pfn))
1964 			continue;
1965 		if (!early_pfn_in_nid(pfn, nid))
1966 			continue;
1967 		page = pfn_to_page(pfn);
1968 		set_page_links(page, zone, nid, pfn);
1969 		init_page_count(page);
1970 		reset_page_mapcount(page);
1971 		SetPageReserved(page);
1972 		INIT_LIST_HEAD(&page->lru);
1973 #ifdef WANT_PAGE_VIRTUAL
1974 		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
1975 		if (!is_highmem_idx(zone))
1976 			set_page_address(page, __va(pfn << PAGE_SHIFT));
1977 #endif
1978 	}
1979 }
1980 
1981 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1982 				unsigned long size)
1983 {
1984 	int order;
1985 	for (order = 0; order < MAX_ORDER ; order++) {
1986 		INIT_LIST_HEAD(&zone->free_area[order].free_list);
1987 		zone->free_area[order].nr_free = 0;
1988 	}
1989 }
1990 
1991 #ifndef __HAVE_ARCH_MEMMAP_INIT
1992 #define memmap_init(size, nid, zone, start_pfn) \
1993 	memmap_init_zone((size), (nid), (zone), (start_pfn))
1994 #endif
1995 
1996 static int __cpuinit zone_batchsize(struct zone *zone)
1997 {
1998 	int batch;
1999 
2000 	/*
2001 	 * The per-cpu-pages pools are set to around 1/1000th of the
2002 	 * size of the zone, but to no more than half a megabyte.
2003 	 *
2004 	 * OK, so we don't know how big the cache is.  So guess.
2005 	 */
2006 	batch = zone->present_pages / 1024;
2007 	if (batch * PAGE_SIZE > 512 * 1024)
2008 		batch = (512 * 1024) / PAGE_SIZE;
2009 	batch /= 4;		/* We effectively *= 4 below */
2010 	if (batch < 1)
2011 		batch = 1;
2012 
2013 	/*
2014 	 * Clamp the batch to a 2^n - 1 value. Having a power
2015 	 * of 2 value was found to be more likely to have
2016 	 * suboptimal cache aliasing properties in some cases.
2017 	 *
2018 	 * For example if 2 tasks are alternately allocating
2019 	 * batches of pages, one task can end up with a lot
2020 	 * of pages of one half of the possible page colors
2021 	 * and the other with pages of the other colors.
2022 	 */
2023 	batch = (1 << (fls(batch + batch/2)-1)) - 1;
2024 
2025 	return batch;
2026 }
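
/*
 * Worked example (illustrative, assuming 4K pages): for a zone with
 * 262144 present pages (1GB), batch starts at 262144/1024 = 256, is
 * clamped to 512K/4K = 128, divided by 4 to 32, and finally rounded to
 * a 2^n - 1 value: (1 << (fls(48) - 1)) - 1 = 31.
 */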
2027 
2028 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2029 {
2030 	struct per_cpu_pages *pcp;
2031 
2032 	memset(p, 0, sizeof(*p));
2033 
2034 	pcp = &p->pcp[0];		/* hot */
2035 	pcp->count = 0;
2036 	pcp->high = 6 * batch;
2037 	pcp->batch = max(1UL, 1 * batch);
2038 	INIT_LIST_HEAD(&pcp->list);
2039 
2040 	pcp = &p->pcp[1];		/* cold*/
2041 	pcp->count = 0;
2042 	pcp->high = 2 * batch;
2043 	pcp->batch = max(1UL, batch/2);
2044 	INIT_LIST_HEAD(&pcp->list);
2045 }
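
/*
 * Continuing the batch == 31 sketch above (illustrative only): the hot
 * list gets high = 6 * 31 = 186 and batch = 31, the cold list gets
 * high = 62 and batch = 15, so once a CPU's hot list for the zone grows
 * to 186 pages a batch of 31 is handed back to the buddy allocator.
 */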
2046 
2047 /*
2048  * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
2049  * to the value high for the pageset p.
2050  */
2051 
2052 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2053 				unsigned long high)
2054 {
2055 	struct per_cpu_pages *pcp;
2056 
2057 	pcp = &p->pcp[0]; /* hot list */
2058 	pcp->high = high;
2059 	pcp->batch = max(1UL, high/4);
2060 	if ((high/4) > (PAGE_SHIFT * 8))
2061 		pcp->batch = PAGE_SHIFT * 8;
2062 }
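
/*
 * Illustrative example: writing 8 to /proc/sys/vm/percpu_pagelist_fraction
 * for a zone with 262144 present pages gives high = 262144/8 = 32768;
 * high/4 = 8192 exceeds PAGE_SHIFT * 8 (96 with 4K pages), so the hot
 * batch is capped at 96.
 */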
2063 
2064 
2065 #ifdef CONFIG_NUMA
2066 /*
2067  * Boot pageset table. One per cpu which is going to be used for all
2068  * zones and all nodes. The parameters will be set in such a way
2069  * that an item put on a list will immediately be handed over to
2070  * the buddy list. This is safe since pageset manipulation is done
2071  * with interrupts disabled.
2072  *
2073  * Some NUMA counter updates may also be caught by the boot pagesets.
2074  *
2075  * The boot_pagesets must be kept even after bootup is complete for
2076  * unused processors and/or zones. They do play a role for bootstrapping
2077  * hotplugged processors.
2078  *
2079  * zoneinfo_show() and maybe other functions do
2080  * not check if the processor is online before following the pageset pointer.
2081  * Other parts of the kernel may not check if the zone is available.
2082  */
2083 static struct per_cpu_pageset boot_pageset[NR_CPUS];
2084 
2085 /*
2086  * Dynamically allocate memory for the
2087  * per cpu pageset array in struct zone.
2088  */
2089 static int __cpuinit process_zones(int cpu)
2090 {
2091 	struct zone *zone, *dzone;
2092 
2093 	for_each_zone(zone) {
2094 
2095 		if (!populated_zone(zone))
2096 			continue;
2097 
2098 		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2099 					 GFP_KERNEL, cpu_to_node(cpu));
2100 		if (!zone_pcp(zone, cpu))
2101 			goto bad;
2102 
2103 		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
2104 
2105 		if (percpu_pagelist_fraction)
2106 			setup_pagelist_highmark(zone_pcp(zone, cpu),
2107 			 	(zone->present_pages / percpu_pagelist_fraction));
2108 	}
2109 
2110 	return 0;
2111 bad:
2112 	for_each_zone(dzone) {
2113 		if (dzone == zone)
2114 			break;
2115 		kfree(zone_pcp(dzone, cpu));
2116 		zone_pcp(dzone, cpu) = NULL;
2117 	}
2118 	return -ENOMEM;
2119 }
2120 
2121 static inline void free_zone_pagesets(int cpu)
2122 {
2123 	struct zone *zone;
2124 
2125 	for_each_zone(zone) {
2126 		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2127 
2128 		/* Free per_cpu_pageset if it is slab allocated */
2129 		if (pset != &boot_pageset[cpu])
2130 			kfree(pset);
2131 		zone_pcp(zone, cpu) = NULL;
2132 	}
2133 }
2134 
2135 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2136 		unsigned long action,
2137 		void *hcpu)
2138 {
2139 	int cpu = (long)hcpu;
2140 	int ret = NOTIFY_OK;
2141 
2142 	switch (action) {
2143 	case CPU_UP_PREPARE:
2144 		if (process_zones(cpu))
2145 			ret = NOTIFY_BAD;
2146 		break;
2147 	case CPU_UP_CANCELED:
2148 	case CPU_DEAD:
2149 		free_zone_pagesets(cpu);
2150 		break;
2151 	default:
2152 		break;
2153 	}
2154 	return ret;
2155 }
2156 
2157 static struct notifier_block __cpuinitdata pageset_notifier =
2158 	{ &pageset_cpuup_callback, NULL, 0 };
2159 
2160 void __init setup_per_cpu_pageset(void)
2161 {
2162 	int err;
2163 
2164 	/* Initialize per_cpu_pageset for cpu 0.
2165 	 * A cpuup callback will do this for every cpu
2166 	 * as it comes online
2167 	 */
2168 	err = process_zones(smp_processor_id());
2169 	BUG_ON(err);
2170 	register_cpu_notifier(&pageset_notifier);
2171 }
2172 
2173 #endif
2174 
2175 static __meminit
2176 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2177 {
2178 	int i;
2179 	struct pglist_data *pgdat = zone->zone_pgdat;
2180 	size_t alloc_size;
2181 
2182 	/*
2183 	 * The per-page waitqueue mechanism uses hashed waitqueues
2184 	 * per zone.
2185 	 */
2186 	zone->wait_table_hash_nr_entries =
2187 		 wait_table_hash_nr_entries(zone_size_pages);
2188 	zone->wait_table_bits =
2189 		wait_table_bits(zone->wait_table_hash_nr_entries);
2190 	alloc_size = zone->wait_table_hash_nr_entries
2191 					* sizeof(wait_queue_head_t);
2192 
2193  	if (system_state == SYSTEM_BOOTING) {
2194 		zone->wait_table = (wait_queue_head_t *)
2195 			alloc_bootmem_node(pgdat, alloc_size);
2196 	} else {
2197 		/*
2198 		 * This path is taken when a zone whose size was 0 gains new
2199 		 * memory via memory hot-add.
2200 		 * It may also be that an entire new node was hot-added.  In
2201 		 * that case vmalloc() cannot yet allocate from the new node's
2202 		 * memory, even though this wait_table must serve the new node
2203 		 * as well.
2204 		 * Making the table use the new node's own memory will need
2205 		 * further work.
2206 		 */
2207 		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
2208 	}
2209 	if (!zone->wait_table)
2210 		return -ENOMEM;
2211 
2212 	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2213 		init_waitqueue_head(zone->wait_table + i);
2214 
2215 	return 0;
2216 }
2217 
2218 static __meminit void zone_pcp_init(struct zone *zone)
2219 {
2220 	int cpu;
2221 	unsigned long batch = zone_batchsize(zone);
2222 
2223 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
2224 #ifdef CONFIG_NUMA
2225 		/* Early boot. Slab allocator not functional yet */
2226 		zone_pcp(zone, cpu) = &boot_pageset[cpu];
2227 		setup_pageset(&boot_pageset[cpu],0);
2228 #else
2229 		setup_pageset(zone_pcp(zone,cpu), batch);
2230 #endif
2231 	}
2232 	if (zone->present_pages)
2233 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
2234 			zone->name, zone->present_pages, batch);
2235 }
2236 
2237 __meminit int init_currently_empty_zone(struct zone *zone,
2238 					unsigned long zone_start_pfn,
2239 					unsigned long size)
2240 {
2241 	struct pglist_data *pgdat = zone->zone_pgdat;
2242 	int ret;
2243 	ret = zone_wait_table_init(zone, size);
2244 	if (ret)
2245 		return ret;
2246 	pgdat->nr_zones = zone_idx(zone) + 1;
2247 
2248 	zone->zone_start_pfn = zone_start_pfn;
2249 
2250 	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2251 
2252 	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
2253 
2254 	return 0;
2255 }
2256 
2257 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2258 /*
2259  * Basic iterator support. Return the first range of PFNs for a node
2260  * Note: nid == MAX_NUMNODES returns first region regardless of node
2261  */
2262 static int __init first_active_region_index_in_nid(int nid)
2263 {
2264 	int i;
2265 
2266 	for (i = 0; i < nr_nodemap_entries; i++)
2267 		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2268 			return i;
2269 
2270 	return -1;
2271 }
2272 
2273 /*
2274  * Basic iterator support. Return the next active range of PFNs for a node
2275  * Note: nid == MAX_NUMNODES returns the next region regardless of node
2276  */
2277 static int __init next_active_region_index_in_nid(int index, int nid)
2278 {
2279 	for (index = index + 1; index < nr_nodemap_entries; index++)
2280 		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2281 			return index;
2282 
2283 	return -1;
2284 }
2285 
2286 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2287 /*
2288  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2289  * Architectures may implement their own version but if add_active_range()
2290  * was used and there are no special requirements, this is a convenient
2291  * alternative
2292  */
2293 int __init early_pfn_to_nid(unsigned long pfn)
2294 {
2295 	int i;
2296 
2297 	for (i = 0; i < nr_nodemap_entries; i++) {
2298 		unsigned long start_pfn = early_node_map[i].start_pfn;
2299 		unsigned long end_pfn = early_node_map[i].end_pfn;
2300 
2301 		if (start_pfn <= pfn && pfn < end_pfn)
2302 			return early_node_map[i].nid;
2303 	}
2304 
2305 	return 0;
2306 }
2307 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2308 
2309 /* Basic iterator support to walk early_node_map[] */
2310 #define for_each_active_range_index_in_nid(i, nid) \
2311 	for (i = first_active_region_index_in_nid(nid); i != -1; \
2312 				i = next_active_region_index_in_nid(i, nid))
2313 
2314 /**
2315  * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2316  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2317  * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2318  *
2319  * If an architecture guarantees that all ranges registered with
2320  * add_active_range() contain no holes and may be freed, this
2321  * function may be used instead of calling free_bootmem() manually.
2322  */
2323 void __init free_bootmem_with_active_regions(int nid,
2324 						unsigned long max_low_pfn)
2325 {
2326 	int i;
2327 
2328 	for_each_active_range_index_in_nid(i, nid) {
2329 		unsigned long size_pages = 0;
2330 		unsigned long end_pfn = early_node_map[i].end_pfn;
2331 
2332 		if (early_node_map[i].start_pfn >= max_low_pfn)
2333 			continue;
2334 
2335 		if (end_pfn > max_low_pfn)
2336 			end_pfn = max_low_pfn;
2337 
2338 		size_pages = end_pfn - early_node_map[i].start_pfn;
2339 		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2340 				PFN_PHYS(early_node_map[i].start_pfn),
2341 				size_pages << PAGE_SHIFT);
2342 	}
2343 }
2344 
2345 /**
2346  * sparse_memory_present_with_active_regions - Call memory_present for each active range
2347  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2348  *
2349  * If an architecture guarantees that all ranges registered with
2350  * add_active_ranges() contain no holes and may be freed, this
2351  * function may be used instead of calling memory_present() manually.
2352  */
2353 void __init sparse_memory_present_with_active_regions(int nid)
2354 {
2355 	int i;
2356 
2357 	for_each_active_range_index_in_nid(i, nid)
2358 		memory_present(early_node_map[i].nid,
2359 				early_node_map[i].start_pfn,
2360 				early_node_map[i].end_pfn);
2361 }
2362 
2363 /**
2364  * push_node_boundaries - Push node boundaries to at least the requested boundary
2365  * @nid: The nid of the node to push the boundary for
2366  * @start_pfn: The start pfn of the node
2367  * @end_pfn: The end pfn of the node
2368  *
2369  * In reserve-based hot-add, mem_map is allocated up front but left unused until
2370  * hotadd time. Specifically, on x86_64, SRAT will report ranges that can potentially
2371  * be hotplugged even though no physical memory exists. This function allows
2372  * an arch to push out the node boundaries so mem_map is allocated that can
2373  * be used later.
2374  */
2375 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2376 void __init push_node_boundaries(unsigned int nid,
2377 		unsigned long start_pfn, unsigned long end_pfn)
2378 {
2379 	printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2380 			nid, start_pfn, end_pfn);
2381 
2382 	/* Initialise the boundary for this node if necessary */
2383 	if (node_boundary_end_pfn[nid] == 0)
2384 		node_boundary_start_pfn[nid] = -1UL;
2385 
2386 	/* Update the boundaries */
2387 	if (node_boundary_start_pfn[nid] > start_pfn)
2388 		node_boundary_start_pfn[nid] = start_pfn;
2389 	if (node_boundary_end_pfn[nid] < end_pfn)
2390 		node_boundary_end_pfn[nid] = end_pfn;
2391 }
2392 
2393 /* If necessary, push the node boundary out for reserve hotadd */
2394 static void __init account_node_boundary(unsigned int nid,
2395 		unsigned long *start_pfn, unsigned long *end_pfn)
2396 {
2397 	printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2398 			nid, *start_pfn, *end_pfn);
2399 
2400 	/* Return if boundary information has not been provided */
2401 	if (node_boundary_end_pfn[nid] == 0)
2402 		return;
2403 
2404 	/* Check the boundaries and update if necessary */
2405 	if (node_boundary_start_pfn[nid] < *start_pfn)
2406 		*start_pfn = node_boundary_start_pfn[nid];
2407 	if (node_boundary_end_pfn[nid] > *end_pfn)
2408 		*end_pfn = node_boundary_end_pfn[nid];
2409 }
2410 #else
2411 void __init push_node_boundaries(unsigned int nid,
2412 		unsigned long start_pfn, unsigned long end_pfn) {}
2413 
2414 static void __init account_node_boundary(unsigned int nid,
2415 		unsigned long *start_pfn, unsigned long *end_pfn) {}
2416 #endif
2417 
2418 
2419 /**
2420  * get_pfn_range_for_nid - Return the start and end page frames for a node
2421  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
2422  * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
2423  * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
2424  *
2425  * It returns the start and end page frame of a node based on information
2426  * provided by an arch calling add_active_range(). If called for a node
2427  * with no available memory, a warning is printed and the start and end
2428  * PFNs will be 0.
2429  */
2430 void __init get_pfn_range_for_nid(unsigned int nid,
2431 			unsigned long *start_pfn, unsigned long *end_pfn)
2432 {
2433 	int i;
2434 	*start_pfn = -1UL;
2435 	*end_pfn = 0;
2436 
2437 	for_each_active_range_index_in_nid(i, nid) {
2438 		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2439 		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2440 	}
2441 
2442 	if (*start_pfn == -1UL) {
2443 		printk(KERN_WARNING "Node %u active with no memory\n", nid);
2444 		*start_pfn = 0;
2445 	}
2446 
2447 	/* Push the node boundaries out if requested */
2448 	account_node_boundary(nid, start_pfn, end_pfn);
2449 }
2450 
2451 /*
2452  * Return the number of pages a zone spans in a node, including holes
2453  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2454  */
2455 unsigned long __init zone_spanned_pages_in_node(int nid,
2456 					unsigned long zone_type,
2457 					unsigned long *ignored)
2458 {
2459 	unsigned long node_start_pfn, node_end_pfn;
2460 	unsigned long zone_start_pfn, zone_end_pfn;
2461 
2462 	/* Get the start and end of the node and zone */
2463 	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2464 	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2465 	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2466 
2467 	/* Check that this node has pages within the zone's required range */
2468 	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2469 		return 0;
2470 
2471 	/* Move the zone boundaries inside the node if necessary */
2472 	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2473 	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2474 
2475 	/* Return the spanned pages */
2476 	return zone_end_pfn - zone_start_pfn;
2477 }
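
/*
 * Illustrative example: if ZONE_DMA spans PFNs [0, 4096) architecture-wide
 * but this node covers PFNs [1024, 262144), the boundaries are clamped to
 * [1024, 4096) and the function returns 3072 spanned pages.
 */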
2478 
2479 /*
2480  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2481  * then all holes in the requested range will be accounted for.
2482  */
2483 unsigned long __init __absent_pages_in_range(int nid,
2484 				unsigned long range_start_pfn,
2485 				unsigned long range_end_pfn)
2486 {
2487 	int i = 0;
2488 	unsigned long prev_end_pfn = 0, hole_pages = 0;
2489 	unsigned long start_pfn;
2490 
2491 	/* Find the first active range of pfns in the node */
2492 	i = first_active_region_index_in_nid(nid);
2493 	if (i == -1)
2494 		return 0;
2495 
2496 	/* Account for ranges before physical memory on this node */
2497 	if (early_node_map[i].start_pfn > range_start_pfn)
2498 		hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2499 
2500 	prev_end_pfn = early_node_map[i].start_pfn;
2501 
2502 	/* Find all holes for the zone within the node */
2503 	for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2504 
2505 		/* No need to continue if prev_end_pfn is outside the zone */
2506 		if (prev_end_pfn >= range_end_pfn)
2507 			break;
2508 
2509 		/* Make sure the end of the zone is not within the hole */
2510 		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2511 		prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2512 
2513 		/* Update the hole size count and move on */
2514 		if (start_pfn > range_start_pfn) {
2515 			BUG_ON(prev_end_pfn > start_pfn);
2516 			hole_pages += start_pfn - prev_end_pfn;
2517 		}
2518 		prev_end_pfn = early_node_map[i].end_pfn;
2519 	}
2520 
2521 	/* Account for ranges past physical memory on this node */
2522 	if (range_end_pfn > prev_end_pfn)
2523 		hole_pages += range_end_pfn -
2524 				max(range_start_pfn, prev_end_pfn);
2525 
2526 	return hole_pages;
2527 }
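
/*
 * Illustrative example: if a node registered active ranges [0, 100) and
 * [200, 300), then __absent_pages_in_range(nid, 0, 300) walks both
 * ranges, finds the gap between PFN 100 and PFN 200, and returns 100
 * hole pages.
 */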
2528 
2529 /**
2530  * absent_pages_in_range - Return number of page frames in holes within a range
2531  * @start_pfn: The start PFN to start searching for holes
2532  * @end_pfn: The end PFN to stop searching for holes
2533  *
2534  * It returns the number of page frames in memory holes within a range.
2535  */
2536 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2537 							unsigned long end_pfn)
2538 {
2539 	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2540 }
2541 
2542 /* Return the number of page frames in holes in a zone on a node */
2543 unsigned long __init zone_absent_pages_in_node(int nid,
2544 					unsigned long zone_type,
2545 					unsigned long *ignored)
2546 {
2547 	unsigned long node_start_pfn, node_end_pfn;
2548 	unsigned long zone_start_pfn, zone_end_pfn;
2549 
2550 	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2551 	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2552 							node_start_pfn);
2553 	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2554 							node_end_pfn);
2555 
2556 	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2557 }
2558 
2559 #else
2560 static inline unsigned long zone_spanned_pages_in_node(int nid,
2561 					unsigned long zone_type,
2562 					unsigned long *zones_size)
2563 {
2564 	return zones_size[zone_type];
2565 }
2566 
2567 static inline unsigned long zone_absent_pages_in_node(int nid,
2568 						unsigned long zone_type,
2569 						unsigned long *zholes_size)
2570 {
2571 	if (!zholes_size)
2572 		return 0;
2573 
2574 	return zholes_size[zone_type];
2575 }
2576 
2577 #endif
2578 
2579 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2580 		unsigned long *zones_size, unsigned long *zholes_size)
2581 {
2582 	unsigned long realtotalpages, totalpages = 0;
2583 	enum zone_type i;
2584 
2585 	for (i = 0; i < MAX_NR_ZONES; i++)
2586 		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2587 								zones_size);
2588 	pgdat->node_spanned_pages = totalpages;
2589 
2590 	realtotalpages = totalpages;
2591 	for (i = 0; i < MAX_NR_ZONES; i++)
2592 		realtotalpages -=
2593 			zone_absent_pages_in_node(pgdat->node_id, i,
2594 								zholes_size);
2595 	pgdat->node_present_pages = realtotalpages;
2596 	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2597 							realtotalpages);
2598 }
2599 
2600 /*
2601  * Set up the zone data structures:
2602  *   - mark all pages reserved
2603  *   - mark all memory queues empty
2604  *   - clear the memory bitmaps
2605  */
2606 static void __meminit free_area_init_core(struct pglist_data *pgdat,
2607 		unsigned long *zones_size, unsigned long *zholes_size)
2608 {
2609 	enum zone_type j;
2610 	int nid = pgdat->node_id;
2611 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
2612 	int ret;
2613 
2614 	pgdat_resize_init(pgdat);
2615 	pgdat->nr_zones = 0;
2616 	init_waitqueue_head(&pgdat->kswapd_wait);
2617 	pgdat->kswapd_max_order = 0;
2618 
2619 	for (j = 0; j < MAX_NR_ZONES; j++) {
2620 		struct zone *zone = pgdat->node_zones + j;
2621 		unsigned long size, realsize, memmap_pages;
2622 
2623 		size = zone_spanned_pages_in_node(nid, j, zones_size);
2624 		realsize = size - zone_absent_pages_in_node(nid, j,
2625 								zholes_size);
2626 
2627 		/*
2628 		 * Adjust realsize so that it accounts for how much memory
2629 		 * is used by this zone for memmap. This affects the watermark
2630 		 * and per-cpu initialisations
2631 		 */
2632 		memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2633 		if (realsize >= memmap_pages) {
2634 			realsize -= memmap_pages;
2635 			printk(KERN_DEBUG
2636 				"  %s zone: %lu pages used for memmap\n",
2637 				zone_names[j], memmap_pages);
2638 		} else
2639 			printk(KERN_WARNING
2640 				"  %s zone: %lu pages exceeds realsize %lu\n",
2641 				zone_names[j], memmap_pages, realsize);
2642 
2643 		/* Account for reserved DMA pages */
2644 		if (j == ZONE_DMA && realsize > dma_reserve) {
2645 			realsize -= dma_reserve;
2646 			printk(KERN_DEBUG "  DMA zone: %lu pages reserved\n",
2647 								dma_reserve);
2648 		}
2649 
2650 		if (!is_highmem_idx(j))
2651 			nr_kernel_pages += realsize;
2652 		nr_all_pages += realsize;
2653 
2654 		zone->spanned_pages = size;
2655 		zone->present_pages = realsize;
2656 #ifdef CONFIG_NUMA
2657 		zone->node = nid;
2658 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2659 						/ 100;
2660 		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2661 #endif
2662 		zone->name = zone_names[j];
2663 		spin_lock_init(&zone->lock);
2664 		spin_lock_init(&zone->lru_lock);
2665 		zone_seqlock_init(zone);
2666 		zone->zone_pgdat = pgdat;
2667 		zone->free_pages = 0;
2668 
2669 		zone->prev_priority = DEF_PRIORITY;
2670 
2671 		zone_pcp_init(zone);
2672 		INIT_LIST_HEAD(&zone->active_list);
2673 		INIT_LIST_HEAD(&zone->inactive_list);
2674 		zone->nr_scan_active = 0;
2675 		zone->nr_scan_inactive = 0;
2676 		zone->nr_active = 0;
2677 		zone->nr_inactive = 0;
2678 		zap_zone_vm_stats(zone);
2679 		atomic_set(&zone->reclaim_in_progress, 0);
2680 		if (!size)
2681 			continue;
2682 
2683 		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2684 		BUG_ON(ret);
2685 		zone_start_pfn += size;
2686 	}
2687 }
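
/*
 * Illustrative memmap accounting example (sizeof(struct page) is assumed
 * to be 32 bytes here; the real value depends on the configuration): a
 * zone spanning 262144 4K pages needs 262144 * 32 >> 12 = 2048 pages of
 * memmap, so its present_pages (and hence its watermarks and per-cpu
 * batch sizing) is reduced by 2048 pages before the zone is initialised.
 */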
2688 
2689 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2690 {
2691 	/* Skip empty nodes */
2692 	if (!pgdat->node_spanned_pages)
2693 		return;
2694 
2695 #ifdef CONFIG_FLAT_NODE_MEM_MAP
2696 	/* ia64 gets its own node_mem_map, before this, without bootmem */
2697 	if (!pgdat->node_mem_map) {
2698 		unsigned long size, start, end;
2699 		struct page *map;
2700 
2701 		/*
2702 		 * The zone's endpoints aren't required to be MAX_ORDER
2703 		 * aligned but the node_mem_map endpoints must be in order
2704 		 * for the buddy allocator to function correctly.
2705 		 */
2706 		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
2707 		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
2708 		end = ALIGN(end, MAX_ORDER_NR_PAGES);
2709 		size =  (end - start) * sizeof(struct page);
2710 		map = alloc_remap(pgdat->node_id, size);
2711 		if (!map)
2712 			map = alloc_bootmem_node(pgdat, size);
2713 		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
2714 	}
2715 #ifdef CONFIG_FLATMEM
2716 	/*
2717 	 * With no DISCONTIG, the global mem_map is just set as node 0's
2718 	 */
2719 	if (pgdat == NODE_DATA(0)) {
2720 		mem_map = NODE_DATA(0)->node_mem_map;
2721 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2722 		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2723 			mem_map -= pgdat->node_start_pfn;
2724 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2725 	}
2726 #endif
2727 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
2728 }
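
/*
 * Illustrative alignment example (assuming MAX_ORDER_NR_PAGES == 1024):
 * a node starting at PFN 4352 with 1000 spanned pages gets start rounded
 * down to 4096 and end rounded up to 6144, so the map covers 2048
 * struct pages and node_mem_map points 256 entries into it.
 */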
2729 
2730 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2731 		unsigned long *zones_size, unsigned long node_start_pfn,
2732 		unsigned long *zholes_size)
2733 {
2734 	pgdat->node_id = nid;
2735 	pgdat->node_start_pfn = node_start_pfn;
2736 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
2737 
2738 	alloc_node_mem_map(pgdat);
2739 
2740 	free_area_init_core(pgdat, zones_size, zholes_size);
2741 }
2742 
2743 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2744 /**
2745  * add_active_range - Register a range of PFNs backed by physical memory
2746  * @nid: The node ID the range resides on
2747  * @start_pfn: The start PFN of the available physical memory
2748  * @end_pfn: The end PFN of the available physical memory
2749  *
2750  * These ranges are stored in an early_node_map[] and later used by
2751  * free_area_init_nodes() to calculate zone sizes and holes. If the
2752  * range spans a memory hole, it is up to the architecture to ensure
2753  * the memory is not freed by the bootmem allocator. If possible
2754  * the range being registered will be merged with existing ranges.
2755  */
2756 void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2757 						unsigned long end_pfn)
2758 {
2759 	int i;
2760 
2761 	printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2762 			  "%d entries of %d used\n",
2763 			  nid, start_pfn, end_pfn,
2764 			  nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2765 
2766 	/* Merge with existing active regions if possible */
2767 	for (i = 0; i < nr_nodemap_entries; i++) {
2768 		if (early_node_map[i].nid != nid)
2769 			continue;
2770 
2771 		/* Skip if an existing region covers this new one */
2772 		if (start_pfn >= early_node_map[i].start_pfn &&
2773 				end_pfn <= early_node_map[i].end_pfn)
2774 			return;
2775 
2776 		/* Merge forward if suitable */
2777 		if (start_pfn <= early_node_map[i].end_pfn &&
2778 				end_pfn > early_node_map[i].end_pfn) {
2779 			early_node_map[i].end_pfn = end_pfn;
2780 			return;
2781 		}
2782 
2783 		/* Merge backward if suitable */
2784 		if (start_pfn < early_node_map[i].end_pfn &&
2785 				end_pfn >= early_node_map[i].start_pfn) {
2786 			early_node_map[i].start_pfn = start_pfn;
2787 			return;
2788 		}
2789 	}
2790 
2791 	/* Check that early_node_map is large enough */
2792 	if (i >= MAX_ACTIVE_REGIONS) {
2793 		printk(KERN_CRIT "More than %d memory regions, truncating\n",
2794 							MAX_ACTIVE_REGIONS);
2795 		return;
2796 	}
2797 
2798 	early_node_map[i].nid = nid;
2799 	early_node_map[i].start_pfn = start_pfn;
2800 	early_node_map[i].end_pfn = end_pfn;
2801 	nr_nodemap_entries = i + 1;
2802 }
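
/*
 * Illustrative merge example: if early_node_map[] already holds
 * { nid 0, [0, 100) } and add_active_range(0, 50, 200) is called, the
 * "merge forward" case extends the existing entry to [0, 200) instead
 * of consuming a new slot.
 */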
2803 
2804 /**
2805  * shrink_active_range - Shrink an existing registered range of PFNs
2806  * @nid: The node id the range is on that should be shrunk
2807  * @old_end_pfn: The old end PFN of the range
2808  * @new_end_pfn: The new end PFN of the range
2809  *
2810  * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
2811  * The map is kept at the end of the physical page range that has already been
2812  * registered with add_active_range(). This function allows an arch to shrink
2813  * an existing registered range.
2814  */
2815 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2816 						unsigned long new_end_pfn)
2817 {
2818 	int i;
2819 
2820 	/* Find the old active region end and shrink */
2821 	for_each_active_range_index_in_nid(i, nid)
2822 		if (early_node_map[i].end_pfn == old_end_pfn) {
2823 			early_node_map[i].end_pfn = new_end_pfn;
2824 			break;
2825 		}
2826 }
2827 
2828 /**
2829  * remove_all_active_ranges - Remove all currently registered regions
2830  *
2831  * During discovery, it may be found that a table like SRAT is invalid
2832  * and an alternative discovery method must be used. This function removes
2833  * all currently registered regions.
2834  */
2835 void __init remove_all_active_ranges(void)
2836 {
2837 	memset(early_node_map, 0, sizeof(early_node_map));
2838 	nr_nodemap_entries = 0;
2839 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2840 	memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2841 	memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2842 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2843 }
2844 
2845 /* Compare two active node_active_regions */
2846 static int __init cmp_node_active_region(const void *a, const void *b)
2847 {
2848 	struct node_active_region *arange = (struct node_active_region *)a;
2849 	struct node_active_region *brange = (struct node_active_region *)b;
2850 
2851 	/* Done this way to avoid overflows */
2852 	if (arange->start_pfn > brange->start_pfn)
2853 		return 1;
2854 	if (arange->start_pfn < brange->start_pfn)
2855 		return -1;
2856 
2857 	return 0;
2858 }
2859 
2860 /* sort the node_map by start_pfn */
2861 static void __init sort_node_map(void)
2862 {
2863 	sort(early_node_map, (size_t)nr_nodemap_entries,
2864 			sizeof(struct node_active_region),
2865 			cmp_node_active_region, NULL);
2866 }
2867 
2868 /* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2869 unsigned long __init find_min_pfn_for_node(unsigned long nid)
2870 {
2871 	int i;
2872 
2873 	/* Regions in the early_node_map can be in any order */
2874 	sort_node_map();
2875 
2876 	/* Assuming a sorted map, the first range found has the starting pfn */
2877 	for_each_active_range_index_in_nid(i, nid)
2878 		return early_node_map[i].start_pfn;
2879 
2880 	printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2881 	return 0;
2882 }
2883 
2884 /**
2885  * find_min_pfn_with_active_regions - Find the minimum PFN registered
2886  *
2887  * It returns the minimum PFN based on information provided via
2888  * add_active_range().
2889  */
2890 unsigned long __init find_min_pfn_with_active_regions(void)
2891 {
2892 	return find_min_pfn_for_node(MAX_NUMNODES);
2893 }
2894 
2895 /**
2896  * find_max_pfn_with_active_regions - Find the maximum PFN registered
2897  *
2898  * It returns the maximum PFN based on information provided via
2899  * add_active_range().
2900  */
2901 unsigned long __init find_max_pfn_with_active_regions(void)
2902 {
2903 	int i;
2904 	unsigned long max_pfn = 0;
2905 
2906 	for (i = 0; i < nr_nodemap_entries; i++)
2907 		max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2908 
2909 	return max_pfn;
2910 }
2911 
2912 /**
2913  * free_area_init_nodes - Initialise all pg_data_t and zone data
2914  * @max_zone_pfn: an array of max PFNs for each zone
2915  *
2916  * This will call free_area_init_node() for each active node in the system.
2917  * Using the page ranges provided by add_active_range(), the size of each
2918  * zone in each node and their holes are calculated. If the maximum PFNs
2919  * of two adjacent zones match, the higher zone is assumed to be empty.
2920  * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2921  * that ZONE_DMA32 has no pages. It is also assumed that a zone
2922  * starts where the previous one ended. For example, ZONE_DMA32 starts
2923  * at arch_max_dma_pfn.
2924  */
2925 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2926 {
2927 	unsigned long nid;
2928 	enum zone_type i;
2929 
2930 	/* Record where the zone boundaries are */
2931 	memset(arch_zone_lowest_possible_pfn, 0,
2932 				sizeof(arch_zone_lowest_possible_pfn));
2933 	memset(arch_zone_highest_possible_pfn, 0,
2934 				sizeof(arch_zone_highest_possible_pfn));
2935 	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2936 	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2937 	for (i = 1; i < MAX_NR_ZONES; i++) {
2938 		arch_zone_lowest_possible_pfn[i] =
2939 			arch_zone_highest_possible_pfn[i-1];
2940 		arch_zone_highest_possible_pfn[i] =
2941 			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2942 	}
2943 
2944 	/* Print out the zone ranges */
2945 	printk("Zone PFN ranges:\n");
2946 	for (i = 0; i < MAX_NR_ZONES; i++)
2947 		printk("  %-8s %8lu -> %8lu\n",
2948 				zone_names[i],
2949 				arch_zone_lowest_possible_pfn[i],
2950 				arch_zone_highest_possible_pfn[i]);
2951 
2952 	/* Print out the early_node_map[] */
2953 	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2954 	for (i = 0; i < nr_nodemap_entries; i++)
2955 		printk("  %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2956 						early_node_map[i].start_pfn,
2957 						early_node_map[i].end_pfn);
2958 
2959 	/* Initialise every node */
2960 	for_each_online_node(nid) {
2961 		pg_data_t *pgdat = NODE_DATA(nid);
2962 		free_area_init_node(nid, pgdat, NULL,
2963 				find_min_pfn_for_node(nid), NULL);
2964 	}
2965 }
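
/*
 * Illustrative example of the zone-boundary rule above (assuming 4K
 * pages and a DMA/Normal/HighMem zone layout): with
 * max_zone_pfn = { 4096, 262144, 262144 } the zones become
 * DMA [lowest registered PFN, 4096), Normal [4096, 262144) and
 * HighMem [262144, 262144), i.e. HighMem is treated as empty because
 * its limit matches the previous zone's.
 */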
2966 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2967 
2968 /**
2969  * set_dma_reserve - set the specified number of pages reserved in the first zone
2970  * @new_dma_reserve: The number of pages to mark reserved
2971  *
2972  * The per-cpu batchsize and zone watermarks are determined by present_pages.
2973  * In the DMA zone, a significant percentage may be consumed by kernel image
2974  * and other unfreeable allocations which can skew the watermarks badly. This
2975  * function may optionally be used to account for unfreeable pages in the
2976  * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
2977  * smaller per-cpu batchsize.
2978  */
2979 void __init set_dma_reserve(unsigned long new_dma_reserve)
2980 {
2981 	dma_reserve = new_dma_reserve;
2982 }
2983 
2984 #ifndef CONFIG_NEED_MULTIPLE_NODES
2985 static bootmem_data_t contig_bootmem_data;
2986 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2987 
2988 EXPORT_SYMBOL(contig_page_data);
2989 #endif
2990 
2991 void __init free_area_init(unsigned long *zones_size)
2992 {
2993 	free_area_init_node(0, NODE_DATA(0), zones_size,
2994 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2995 }
2996 
2997 static int page_alloc_cpu_notify(struct notifier_block *self,
2998 				 unsigned long action, void *hcpu)
2999 {
3000 	int cpu = (unsigned long)hcpu;
3001 
3002 	if (action == CPU_DEAD) {
3003 		local_irq_disable();
3004 		__drain_pages(cpu);
3005 		vm_events_fold_cpu(cpu);
3006 		local_irq_enable();
3007 		refresh_cpu_vm_stats(cpu);
3008 	}
3009 	return NOTIFY_OK;
3010 }
3011 
3012 void __init page_alloc_init(void)
3013 {
3014 	hotcpu_notifier(page_alloc_cpu_notify, 0);
3015 }
3016 
3017 /*
3018  * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
3019  *	or min_free_kbytes changes.
3020  */
3021 static void calculate_totalreserve_pages(void)
3022 {
3023 	struct pglist_data *pgdat;
3024 	unsigned long reserve_pages = 0;
3025 	enum zone_type i, j;
3026 
3027 	for_each_online_pgdat(pgdat) {
3028 		for (i = 0; i < MAX_NR_ZONES; i++) {
3029 			struct zone *zone = pgdat->node_zones + i;
3030 			unsigned long max = 0;
3031 
3032 			/* Find valid and maximum lowmem_reserve in the zone */
3033 			for (j = i; j < MAX_NR_ZONES; j++) {
3034 				if (zone->lowmem_reserve[j] > max)
3035 					max = zone->lowmem_reserve[j];
3036 			}
3037 
3038 			/* we treat pages_high as reserved pages. */
3039 			max += zone->pages_high;
3040 
3041 			if (max > zone->present_pages)
3042 				max = zone->present_pages;
3043 			reserve_pages += max;
3044 		}
3045 	}
3046 	totalreserve_pages = reserve_pages;
3047 }
3048 
3049 /*
3050  * setup_per_zone_lowmem_reserve - called whenever
3051  *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
3052  *	has a correct pages reserved value, so an adequate number of
3053  *	pages are left in the zone after a successful __alloc_pages().
3054  */
3055 static void setup_per_zone_lowmem_reserve(void)
3056 {
3057 	struct pglist_data *pgdat;
3058 	enum zone_type j, idx;
3059 
3060 	for_each_online_pgdat(pgdat) {
3061 		for (j = 0; j < MAX_NR_ZONES; j++) {
3062 			struct zone *zone = pgdat->node_zones + j;
3063 			unsigned long present_pages = zone->present_pages;
3064 
3065 			zone->lowmem_reserve[j] = 0;
3066 
3067 			idx = j;
3068 			while (idx) {
3069 				struct zone *lower_zone;
3070 
3071 				idx--;
3072 
3073 				if (sysctl_lowmem_reserve_ratio[idx] < 1)
3074 					sysctl_lowmem_reserve_ratio[idx] = 1;
3075 
3076 				lower_zone = pgdat->node_zones + idx;
3077 				lower_zone->lowmem_reserve[j] = present_pages /
3078 					sysctl_lowmem_reserve_ratio[idx];
3079 				present_pages += lower_zone->present_pages;
3080 			}
3081 		}
3082 	}
3083 
3084 	/* update totalreserve_pages */
3085 	calculate_totalreserve_pages();
3086 }
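
/*
 * Illustrative example (assuming ratios of 256 for DMA and 32 for
 * Normal, and a node with DMA = 4096, Normal = 221184 and HighMem =
 * 262144 present pages): for j == HighMem the loop sets
 * Normal->lowmem_reserve[HighMem] = 262144 / 32 = 8192 and, after
 * adding Normal's pages, DMA->lowmem_reserve[HighMem] =
 * (262144 + 221184) / 256 = 1888 pages.
 */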
3087 
3088 /**
3089  * setup_per_zone_pages_min - called when min_free_kbytes changes.
3090  *
3091  * Ensures that the pages_{min,low,high} values for each zone are set correctly
3092  * with respect to min_free_kbytes.
3093  */
3094 void setup_per_zone_pages_min(void)
3095 {
3096 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
3097 	unsigned long lowmem_pages = 0;
3098 	struct zone *zone;
3099 	unsigned long flags;
3100 
3101 	/* Calculate total number of !ZONE_HIGHMEM pages */
3102 	for_each_zone(zone) {
3103 		if (!is_highmem(zone))
3104 			lowmem_pages += zone->present_pages;
3105 	}
3106 
3107 	for_each_zone(zone) {
3108 		u64 tmp;
3109 
3110 		spin_lock_irqsave(&zone->lru_lock, flags);
3111 		tmp = (u64)pages_min * zone->present_pages;
3112 		do_div(tmp, lowmem_pages);
3113 		if (is_highmem(zone)) {
3114 			/*
3115 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
3116 			 * need highmem pages, so cap pages_min to a small
3117 			 * value here.
3118 			 *
3119 			 * The (pages_high-pages_low) and (pages_low-pages_min)
3120 			 * deltas control async page reclaim, and so should
3121 			 * not be capped for highmem.
3122 			 */
3123 			int min_pages;
3124 
3125 			min_pages = zone->present_pages / 1024;
3126 			if (min_pages < SWAP_CLUSTER_MAX)
3127 				min_pages = SWAP_CLUSTER_MAX;
3128 			if (min_pages > 128)
3129 				min_pages = 128;
3130 			zone->pages_min = min_pages;
3131 		} else {
3132 			/*
3133 			 * If it's a lowmem zone, reserve a number of pages
3134 			 * proportionate to the zone's size.
3135 			 */
3136 			zone->pages_min = tmp;
3137 		}
3138 
3139 		zone->pages_low   = zone->pages_min + (tmp >> 2);
3140 		zone->pages_high  = zone->pages_min + (tmp >> 1);
3141 		spin_unlock_irqrestore(&zone->lru_lock, flags);
3142 	}
3143 
3144 	/* update totalreserve_pages */
3145 	calculate_totalreserve_pages();
3146 }
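
/*
 * Illustrative example (assuming 4K pages): with min_free_kbytes == 1024
 * the global pages_min is 1024 >> 2 = 256 pages.  A lowmem zone holding
 * half of all lowmem gets tmp = 128, so pages_min = 128, pages_low =
 * 128 + 32 = 160 and pages_high = 128 + 64 = 192.
 */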
3147 
3148 /*
3149  * Initialise min_free_kbytes.
3150  *
3151  * For small machines we want it small (128k min).  For large machines
3152  * we want it large (64MB max).  But it is not linear, because network
3153  * bandwidth does not increase linearly with machine size.  We use
3154  *
3155  * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
3156  *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
3157  *
3158  * which yields
3159  *
3160  * 16MB:	512k
3161  * 32MB:	724k
3162  * 64MB:	1024k
3163  * 128MB:	1448k
3164  * 256MB:	2048k
3165  * 512MB:	2896k
3166  * 1024MB:	4096k
3167  * 2048MB:	5792k
3168  * 4096MB:	8192k
3169  * 8192MB:	11584k
3170  * 16384MB:	16384k
3171  */
3172 static int __init init_per_zone_pages_min(void)
3173 {
3174 	unsigned long lowmem_kbytes;
3175 
3176 	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
3177 
3178 	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
3179 	if (min_free_kbytes < 128)
3180 		min_free_kbytes = 128;
3181 	if (min_free_kbytes > 65536)
3182 		min_free_kbytes = 65536;
3183 	setup_per_zone_pages_min();
3184 	setup_per_zone_lowmem_reserve();
3185 	return 0;
3186 }
3187 module_init(init_per_zone_pages_min)
3188 
3189 /*
3190  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
3191  *	that we can recompute the per-zone watermarks whenever min_free_kbytes
3192  *	changes.
3193  */
3194 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
3195 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3196 {
3197 	proc_dointvec(table, write, file, buffer, length, ppos);
3198 	setup_per_zone_pages_min();
3199 	return 0;
3200 }
3201 
3202 #ifdef CONFIG_NUMA
3203 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
3204 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3205 {
3206 	struct zone *zone;
3207 	int rc;
3208 
3209 	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3210 	if (rc)
3211 		return rc;
3212 
3213 	for_each_zone(zone)
3214 		zone->min_unmapped_pages = (zone->present_pages *
3215 				sysctl_min_unmapped_ratio) / 100;
3216 	return 0;
3217 }
3218 
3219 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
3220 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3221 {
3222 	struct zone *zone;
3223 	int rc;
3224 
3225 	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3226 	if (rc)
3227 		return rc;
3228 
3229 	for_each_zone(zone)
3230 		zone->min_slab_pages = (zone->present_pages *
3231 				sysctl_min_slab_ratio) / 100;
3232 	return 0;
3233 }
3234 #endif
3235 
3236 /*
3237  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
3238  *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
3239  *	whenever sysctl_lowmem_reserve_ratio changes.
3240  *
3241  * The reserve ratio obviously has absolutely no relation with the
3242  * pages_min watermarks. The lowmem reserve ratio only makes sense as
3243  * a function of the boot-time zone sizes.
3244  */
3245 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
3246 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3247 {
3248 	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3249 	setup_per_zone_lowmem_reserve();
3250 	return 0;
3251 }
3252 
3253 /*
3254  * percpu_pagelist_fraction - changes the pcp->high for each zone on each
3255  * cpu.  It is the fraction of total pages in each zone that a hot per-cpu
3256  * pagelist can have before it gets flushed back to the buddy allocator.
3257  */
3258 
3259 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
3260 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3261 {
3262 	struct zone *zone;
3263 	unsigned int cpu;
3264 	int ret;
3265 
3266 	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3267 	if (!write || (ret == -EINVAL))
3268 		return ret;
3269 	for_each_zone(zone) {
3270 		for_each_online_cpu(cpu) {
3271 			unsigned long  high;
3272 			high = zone->present_pages / percpu_pagelist_fraction;
3273 			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
3274 		}
3275 	}
3276 	return 0;
3277 }
3278 
3279 int hashdist = HASHDIST_DEFAULT;
3280 
3281 #ifdef CONFIG_NUMA
3282 static int __init set_hashdist(char *str)
3283 {
3284 	if (!str)
3285 		return 0;
3286 	hashdist = simple_strtoul(str, &str, 0);
3287 	return 1;
3288 }
3289 __setup("hashdist=", set_hashdist);
3290 #endif
3291 
3292 /*
3293  * allocate a large system hash table from bootmem
3294  * - it is assumed that the hash table must contain an exact power-of-2
3295  *   quantity of entries
3296  * - limit is the number of hash buckets, not the total allocation size
3297  */
3298 void *__init alloc_large_system_hash(const char *tablename,
3299 				     unsigned long bucketsize,
3300 				     unsigned long numentries,
3301 				     int scale,
3302 				     int flags,
3303 				     unsigned int *_hash_shift,
3304 				     unsigned int *_hash_mask,
3305 				     unsigned long limit)
3306 {
3307 	unsigned long long max = limit;
3308 	unsigned long log2qty, size;
3309 	void *table = NULL;
3310 
3311 	/* allow the kernel cmdline to have a say */
3312 	if (!numentries) {
3313 		/* round applicable memory size up to nearest megabyte */
3314 		numentries = nr_kernel_pages;
3315 		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
3316 		numentries >>= 20 - PAGE_SHIFT;
3317 		numentries <<= 20 - PAGE_SHIFT;
3318 
3319 		/* limit to 1 bucket per 2^scale bytes of low memory */
3320 		if (scale > PAGE_SHIFT)
3321 			numentries >>= (scale - PAGE_SHIFT);
3322 		else
3323 			numentries <<= (PAGE_SHIFT - scale);
3324 	}
3325 	numentries = roundup_pow_of_two(numentries);
3326 
3327 	/* limit allocation size to 1/16 total memory by default */
3328 	if (max == 0) {
3329 		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
3330 		do_div(max, bucketsize);
3331 	}
3332 
3333 	if (numentries > max)
3334 		numentries = max;
3335 
3336 	log2qty = ilog2(numentries);
3337 
3338 	do {
3339 		size = bucketsize << log2qty;
3340 		if (flags & HASH_EARLY)
3341 			table = alloc_bootmem(size);
3342 		else if (hashdist)
3343 			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
3344 		else {
3345 			unsigned long order;
3346 			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
3347 				;
3348 			table = (void*) __get_free_pages(GFP_ATOMIC, order);
3349 		}
3350 	} while (!table && size > PAGE_SIZE && --log2qty);
3351 
3352 	if (!table)
3353 		panic("Failed to allocate %s hash table\n", tablename);
3354 
3355 	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
3356 	       tablename,
3357 	       (1U << log2qty),
3358 	       ilog2(size) - PAGE_SHIFT,
3359 	       size);
3360 
3361 	if (_hash_shift)
3362 		*_hash_shift = log2qty;
3363 	if (_hash_mask)
3364 		*_hash_mask = (1 << log2qty) - 1;
3365 
3366 	return table;
3367 }
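
/*
 * Typical usage sketch (modelled on callers such as the dentry cache of
 * this era; treat the exact arguments as an illustration rather than a
 * quote from fs/dcache.c):
 *
 *	table = alloc_large_system_hash("Dentry cache",
 *					sizeof(struct hlist_head),
 *					0,		// size from memory
 *					13,		// 1 bucket per 8K of lowmem
 *					HASH_EARLY,
 *					&hash_shift, &hash_mask, 0);
 *
 * A scale of 13 limits the table to one bucket per 2^13 bytes of low
 * memory, and passing 0 for numentries lets nr_kernel_pages drive the
 * sizing above.
 */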
3368 
3369 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
3370 struct page *pfn_to_page(unsigned long pfn)
3371 {
3372 	return __pfn_to_page(pfn);
3373 }
3374 unsigned long page_to_pfn(struct page *page)
3375 {
3376 	return __page_to_pfn(page);
3377 }
3378 EXPORT_SYMBOL(pfn_to_page);
3379 EXPORT_SYMBOL(page_to_pfn);
3380 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3381 
3382 #if MAX_NUMNODES > 1
3383 /*
3384  * Find the highest possible node id.
3385  */
3386 int highest_possible_node_id(void)
3387 {
3388 	unsigned int node;
3389 	unsigned int highest = 0;
3390 
3391 	for_each_node_mask(node, node_possible_map)
3392 		highest = node;
3393 	return highest;
3394 }
3395 EXPORT_SYMBOL(highest_possible_node_id);
3396 #endif
3397