/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list; the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/*
 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
 * initializer cleaner
 */
nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
EXPORT_SYMBOL(node_online_map);
nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
EXPORT_SYMBOL(node_possible_map);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
long nr_swap_pages;
int percpu_pagelist_fraction;

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};
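
/*
 * Worked example of the ratios above (illustrative only, using the 1G
 * split quoted in the comment): with a ratio of 256 for ZONE_DMA, an
 * allocation for the 784M ZONE_NORMAL may dip into ZONE_DMA only while
 * more than 784M/256 (about 3M) of DMA pages stay free; with a ratio of
 * 32 for ZONE_NORMAL, HIGHMEM allocations leave 224M/32 = 7M of
 * ZONE_NORMAL protected, and (224M+784M)/256 (about 4M) of ZONE_DMA.
 * The per-zone lowmem_reserve[] arrays are recomputed from these ratios
 * whenever the sysctl is written.
 */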

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
};

int min_free_kbytes = 1024;

unsigned long __meminitdata nr_kernel_pages;
unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
  /*
   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
   * ranges of memory (RAM) that may be registered with add_active_range().
   * Ranges passed to add_active_range() will be merged if possible
   * so the number of times add_active_range() can be called is
   * related to the number of nodes and the number of holes
   */
  #ifdef CONFIG_MAX_ACTIVE_REGIONS
    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
  #else
    #if MAX_NUMNODES >= 32
      /* If there can be many nodes, allow up to 50 holes per node */
      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
    #else
      /* By default, allow up to 256 distinct regions */
      #define MAX_ACTIVE_REGIONS 256
    #endif
  #endif

  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
  static int __meminitdata nr_nodemap_entries;
  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
  unsigned long __initdata required_kernelcore;
  unsigned long __initdata required_movablecore;
  unsigned long __initdata zone_movable_pfn[MAX_NUMNODES];

  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
  int movable_zone;
  EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
EXPORT_SYMBOL(nr_node_ids);
#endif

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);

	do {
		seq = zone_span_seqbegin(zone);
		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
			ret = 1;
		else if (pfn < zone->zone_start_pfn)
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page)
{
	printk(KERN_EMERG "Bad page state in process '%s'\n"
		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
		KERN_EMERG "Backtrace:\n",
		current->comm, page, (int)(2*sizeof(unsigned long)),
		(unsigned long)page->flags, page->mapping,
		page_mapcount(page), page_count(page));
	dump_stack();
	page->flags &= ~(1 << PG_lru	|
			1 << PG_private |
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_reclaim |
			1 << PG_slab    |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_buddy );
	set_page_count(page, 0);
	reset_page_mapcount(page);
	page->mapping = NULL;
	add_taint(TAINT_BAD_PAGE);
}

/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page".
 *
 * The remaining PAGE_SIZE pages are called "tail pages".
 *
 * All pages have PG_compound set.  All pages have their ->private pointing at
 * the head page (even the head page has this).
 *
 * The first tail page's ->lru.next holds the address of the compound page's
 * put_page() function.  Its ->lru.prev holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

static void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

static void prep_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, free_compound_page);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		__SetPageTail(p);
		p->first_page = page;
	}
}

static void destroy_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	if (unlikely(compound_order(page) != order))
		bad_page(page);

	if (unlikely(!PageHead(page)))
		bad_page(page);
	__ClearPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		if (unlikely(!PageTail(p) |
				(p->first_page != page)))
			bad_page(page);
		__ClearPageTail(p);
	}
}

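/*
 * Illustrative sketch of what prep_compound_page() above sets up for an
 * order-2, __GFP_COMP allocation, i.e. four contiguous struct pages:
 *
 *	page[0]    : marked head via __SetPageHead(), compound order 2,
 *	             destructor set to free_compound_page()
 *	page[1..3] : marked tail via __SetPageTail(), ->first_page = &page[0]
 *
 * destroy_compound_page() checks exactly this state when the block is
 * freed and reports via bad_page() if any of it is missing.
 */
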
static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
	int i;

	VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
	/*
	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
	 * and __GFP_HIGHMEM from hard or soft interrupt context.
	 */
	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
	for (i = 0; i < (1 << order); i++)
		clear_highpage(page + i);
}

/*
 * Functions for dealing with a page's order in the buddy system.
 * zone->lock is already acquired when we use these.
 * So, we don't need atomic page->flags operations here.
 */
static inline unsigned long page_order(struct page *page)
{
	return page_private(page);
}

static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	unsigned long buddy_idx = page_idx ^ (1 << order);

	return page + (buddy_idx - page_idx);
}

static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return (page_idx & ~(1 << order));
}

/*
 * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we use PG_buddy.
 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
								int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_zone_id(page) != page_zone_id(buddy))
		return 0;

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		BUG_ON(page_count(buddy) != 0);
		return 1;
	}
	return 0;
}

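/*
 * Worked example of the index arithmetic above (illustrative): for the
 * order-2 block starting at page_idx 12,
 *
 *	buddy_idx    = 12 ^ (1 << 2) = 8
 *	combined_idx = 12 & ~(1 << 2) = 8
 *
 * so its buddy is the order-2 block at index 8, and if that buddy is
 * free the pair merges into the order-3 block starting at index 8.
 * Repeating at order 3 gives buddy_idx 0 and combined_idx 0, which is
 * how __free_one_page() below keeps coalescing upward until a buddy is
 * found busy or MAX_ORDER-1 is reached.
 */
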
/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of contiguous
 * free pages of length (1 << order) and marked with PG_buddy. Page's
 * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */

static inline void __free_one_page(struct page *page,
		struct zone *zone, unsigned int order)
{
	unsigned long page_idx;
	int order_size = 1 << order;

	if (unlikely(PageCompound(page)))
		destroy_compound_page(page, order);

	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

	VM_BUG_ON(page_idx & (order_size - 1));
	VM_BUG_ON(bad_range(zone, page));

	__mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
	while (order < MAX_ORDER-1) {
		unsigned long combined_idx;
		struct free_area *area;
		struct page *buddy;

		buddy = __page_find_buddy(page, page_idx, order);
		if (!page_is_buddy(page, buddy, order))
			break;		/* Move the buddy up one level. */

		list_del(&buddy->lru);
		area = zone->free_area + order;
		area->nr_free--;
		rmv_page_order(buddy);
		combined_idx = __find_combined_index(page_idx, order);
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);
	list_add(&page->lru, &zone->free_area[order].free_list);
	zone->free_area[order].nr_free++;
}

static inline int free_pages_check(struct page *page)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL)  |
		(page_count(page) != 0)  |
		(page->flags & (
			1 << PG_lru	|
			1 << PG_private |
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_slab	|
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);
	/*
	 * PageReclaim == PageTail. It is only an error
	 * for PageReclaim to be set if PageCompound is clear.
	 */
	if (unlikely(!PageCompound(page) && PageReclaim(page)))
		bad_page(page);
	if (PageDirty(page))
		__ClearPageDirty(page);
	/*
	 * For now, we report if PG_reserved was found set, but do not
	 * clear it, and do not free the page.  But we shall soon need
	 * to do more, for when the ZERO_PAGE count wraps negative.
	 */
	return PageReserved(page);
}

/*
 * Frees a list of pages.
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pages_bulk(struct zone *zone, int count,
					struct list_head *list, int order)
{
	spin_lock(&zone->lock);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;
	while (count--) {
		struct page *page;

		VM_BUG_ON(list_empty(list));
		page = list_entry(list->prev, struct page, lru);
		/* have to delete it as __free_one_page list manipulates */
		list_del(&page->lru);
		__free_one_page(page, zone, order);
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone, struct page *page, int order)
{
	spin_lock(&zone->lock);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;
	__free_one_page(page, zone, order);
	spin_unlock(&zone->lock);
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int i;
	int reserved = 0;

	for (i = 0 ; i < (1 << order) ; ++i)
		reserved += free_pages_check(page + i);
	if (reserved)
		return;

	if (!PageHighMem(page))
		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
	arch_free_page(page, order);
	kernel_map_pages(page, 1 << order, 0);

	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, order);
	local_irq_restore(flags);
}

/*
 * permit the bootmem allocator to evade page validation on high-order frees
 */
void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
{
	if (order == 0) {
		__ClearPageReserved(page);
		set_page_count(page, 0);
		set_page_refcounted(page);
		__free_page(page);
	} else {
		int loop;

		prefetchw(page);
		for (loop = 0; loop < BITS_PER_LONG; loop++) {
			struct page *p = &page[loop];

			if (loop + 1 < BITS_PER_LONG)
				prefetchw(p + 1);
			__ClearPageReserved(p);
			set_page_count(p, 0);
		}

		set_page_refcounted(page);
		__free_pages(page, order);
	}
}


/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */
static inline void expand(struct zone *zone, struct page *page,
 	int low, int high, struct free_area *area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON(bad_range(zone, &page[size]));
		list_add(&page[size].lru, &area->free_list);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}

/*
 * This page is about to be returned from the page allocator
 */
static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL)  |
		(page_count(page) != 0)  |
		(page->flags & (
			1 << PG_lru	|
			1 << PG_private	|
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_reclaim	|
			1 << PG_slab    |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);

	/*
	 * For now, we report if PG_reserved was found set, but do not
	 * clear it, and do not allocate the page: as a safety net.
	 */
	if (PageReserved(page))
		return 1;

	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
			1 << PG_referenced | 1 << PG_arch_1 |
			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	kernel_map_pages(page, 1 << order, 1);

	if (gfp_flags & __GFP_ZERO)
		prep_zero_page(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	return 0;
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order)
{
	struct free_area * area;
	unsigned int current_order;
	struct page *page;

	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = zone->free_area + current_order;
		if (list_empty(&area->free_list))
			continue;

		page = list_entry(area->free_list.next, struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		__mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
		expand(zone, page, order, current_order, area);
		return page;
	}

	return NULL;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list)
{
	int i;

	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order);
		if (unlikely(page == NULL))
			break;
		list_add_tail(&page->lru, list);
	}
	spin_unlock(&zone->lock);
	return i;
}

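/*
 * Worked example of how __rmqueue()/expand() split a block (illustrative):
 * suppose an order-1 request finds the free lists empty until order 3,
 * where an 8-page block starts at index 0.  __rmqueue() removes that
 * block and expand() hands back the unused halves:
 *
 *	first pass : page[4] becomes a free order-2 block on free_area[2]
 *	second pass: page[2] becomes a free order-1 block on free_area[1]
 *
 * leaving pages 0-1 as the order-1 block returned to the caller.
 */
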
#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Note that this function must be called with the thread pinned to
 * a single processor.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	int to_drain;

	local_irq_save(flags);
	if (pcp->count >= pcp->batch)
		to_drain = pcp->batch;
	else
		to_drain = pcp->count;
	free_pages_bulk(zone, to_drain, &pcp->list, 0);
	pcp->count -= to_drain;
	local_irq_restore(flags);
}
#endif

static void __drain_pages(unsigned int cpu)
{
	unsigned long flags;
	struct zone *zone;
	int i;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset;

		if (!populated_zone(zone))
			continue;

		pset = zone_pcp(zone, cpu);
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp;

			pcp = &pset->pcp[i];
			local_irq_save(flags);
			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
			pcp->count = 0;
			local_irq_restore(flags);
		}
	}
}

#ifdef CONFIG_PM

void mark_free_pages(struct zone *zone)
{
	unsigned long pfn, max_zone_pfn;
	unsigned long flags;
	int order;
	struct list_head *curr;

	if (!zone->spanned_pages)
		return;

	spin_lock_irqsave(&zone->lock, flags);

	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
		if (pfn_valid(pfn)) {
			struct page *page = pfn_to_page(pfn);

			if (!swsusp_page_is_forbidden(page))
				swsusp_unset_page_free(page);
		}

	for (order = MAX_ORDER - 1; order >= 0; --order)
		list_for_each(curr, &zone->free_area[order].free_list) {
			unsigned long i;

			pfn = page_to_pfn(list_entry(curr, struct page, lru));
			for (i = 0; i < (1UL << order); i++)
				swsusp_set_page_free(pfn_to_page(pfn + i));
		}

	spin_unlock_irqrestore(&zone->lock, flags);
}

/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 */
void drain_local_pages(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__drain_pages(smp_processor_id());
	local_irq_restore(flags);
}
#endif /* CONFIG_PM */

/*
 * Free a 0-order page
 */
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
	unsigned long flags;

	if (PageAnon(page))
		page->mapping = NULL;
	if (free_pages_check(page))
		return;

	if (!PageHighMem(page))
		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
	arch_free_page(page, 0);
	kernel_map_pages(page, 1, 0);

	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
	local_irq_save(flags);
	__count_vm_event(PGFREE);
	list_add(&page->lru, &pcp->list);
	pcp->count++;
	if (pcp->count >= pcp->high) {
		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
		pcp->count -= pcp->batch;
	}
	local_irq_restore(flags);
	put_cpu();
}

void fastcall free_hot_page(struct page *page)
{
	free_hot_cold_page(page, 0);
}

void fastcall free_cold_page(struct page *page)
{
	free_hot_cold_page(page, 1);
}

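/*
 * Rough sketch of the per-cpu list behaviour above (the numbers are
 * purely illustrative; the real values are derived from the zone size
 * when the pagesets are set up): with pcp->high = 186 and pcp->batch = 31,
 * freed order-0 pages accumulate on this CPU's hot (or cold) list until
 * the count reaches 186, at which point free_pages_bulk() returns one
 * batch of 31 pages to the buddy lists and the list drops back to 155.
 * The allocation side in buffered_rmqueue() below refills an empty list
 * with one batch in the same way.
 */
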
/*
 * split_page takes a non-compound higher-order page, and splits it into
 * n (1<<order) sub-pages: page[0..n]
 * Each sub-page must be freed individually.
 *
 * Note: this is probably too low level an operation for use in drivers.
 * Please consult with lkml before using this in your driver.
 */
void split_page(struct page *page, unsigned int order)
{
	int i;

	VM_BUG_ON(PageCompound(page));
	VM_BUG_ON(!page_count(page));
	for (i = 1; i < (1 << order); i++)
		set_page_refcounted(page + i);
}

/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
static struct page *buffered_rmqueue(struct zonelist *zonelist,
			struct zone *zone, int order, gfp_t gfp_flags)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);
	int cpu;

again:
	cpu  = get_cpu();
	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;

		pcp = &zone_pcp(zone, cpu)->pcp[cold];
		local_irq_save(flags);
		if (!pcp->count) {
			pcp->count = rmqueue_bulk(zone, 0,
						pcp->batch, &pcp->list);
			if (unlikely(!pcp->count))
				goto failed;
		}
		page = list_entry(pcp->list.next, struct page, lru);
		list_del(&page->lru);
		pcp->count--;
	} else {
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order);
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
	}

	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(zonelist, zone);
	local_irq_restore(flags);
	put_cpu();

	VM_BUG_ON(bad_range(zone, page));
	if (prep_new_page(page, order, gfp_flags))
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	put_cpu();
	return NULL;
}

#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
#define ALLOC_HARDER		0x10 /* try to alloc harder */
#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET		0x40 /* check for correct cpuset */

#ifdef CONFIG_FAIL_PAGE_ALLOC

static struct fail_page_alloc_attr {
	struct fault_attr attr;

	u32 ignore_gfp_highmem;
	u32 ignore_gfp_wait;
	u32 min_order;

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

	struct dentry *ignore_gfp_highmem_file;
	struct dentry *ignore_gfp_wait_file;
	struct dentry *min_order_file;

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

} fail_page_alloc = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_gfp_wait = 1,
	.ignore_gfp_highmem = 1,
	.min_order = 1,
};

static int __init setup_fail_page_alloc(char *str)
{
	return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	if (order < fail_page_alloc.min_order)
		return 0;
	if (gfp_mask & __GFP_NOFAIL)
		return 0;
	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
		return 0;
	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
		return 0;

	return should_fail(&fail_page_alloc.attr, 1 << order);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_page_alloc_debugfs(void)
{
	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;
	int err;

	err = init_fault_attr_dentries(&fail_page_alloc.attr,
			"fail_page_alloc");
	if (err)
		return err;
	dir = fail_page_alloc.attr.dentries.dir;

	fail_page_alloc.ignore_gfp_wait_file =
		debugfs_create_bool("ignore-gfp-wait", mode, dir,
				      &fail_page_alloc.ignore_gfp_wait);

	fail_page_alloc.ignore_gfp_highmem_file =
		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
				      &fail_page_alloc.ignore_gfp_highmem);
	fail_page_alloc.min_order_file =
		debugfs_create_u32("min-order", mode, dir,
				   &fail_page_alloc.min_order);

	if (!fail_page_alloc.ignore_gfp_wait_file ||
			!fail_page_alloc.ignore_gfp_highmem_file ||
			!fail_page_alloc.min_order_file) {
		err = -ENOMEM;
		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
		debugfs_remove(fail_page_alloc.min_order_file);
		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
	}

	return err;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#else /* CONFIG_FAIL_PAGE_ALLOC */

static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return 0;
}

#endif /* CONFIG_FAIL_PAGE_ALLOC */

/*
 * Return 1 if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int alloc_flags)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
	int o;

	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;

	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return 0;
	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;

		/* Require fewer higher order pages to be free */
		min >>= 1;

		if (free_pages <= min)
			return 0;
	}
	return 1;
}

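/*
 * Worked example of the check above (all numbers illustrative): an
 * order-2 request against mark = 1024 with no ALLOC_HIGH/ALLOC_HARDER
 * and a zero lowmem_reserve, in a zone with 2000 free pages of which
 * 800 are order-0 pages and 400 sit in order-1 blocks:
 *
 *	start  : free_pages = 2000 - 4 + 1 = 1997 > 1024	ok
 *	order 0: free_pages -= 800 -> 1197, min -> 512		ok
 *	order 1: free_pages -= 400 -> 797,  min -> 256		ok
 *
 * so the watermark is met.  GFP_ATOMIC callers get ALLOC_HIGH and
 * ALLOC_HARDER, which cut the effective mark to roughly 3/8 of its
 * value before the same per-order checks run.
 */
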
#ifdef CONFIG_NUMA
/*
 * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
 * skip over zones that are not allowed by the cpuset, or that have
 * been recently (in last second) found to be nearly full.  See further
 * comments in mmzone.h.  Reduces cache footprint of zonelist scans
 * that have to skip over a lot of full or unallowed zones.
 *
 * If the zonelist cache is present in the passed in zonelist, then
 * returns a pointer to the allowed node mask (either the current
 * tasks mems_allowed, or node_online_map.)
 *
 * If the zonelist cache is not available for this zonelist, does
 * nothing and returns NULL.
 *
 * If the fullzones BITMAP in the zonelist cache is stale (more than
 * a second since last zap'd) then we zap it out (clear its bits.)
 *
 * We hold off even calling zlc_setup, until after we've checked the
 * first zone in the zonelist, on the theory that most allocations will
 * be satisfied from that first zone, so best to examine that zone as
 * quickly as we can.
 */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	nodemask_t *allowednodes;	/* zonelist_cache approximation */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)
		return NULL;

	if (jiffies - zlc->last_full_zap > 1 * HZ) {
		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
		zlc->last_full_zap = jiffies;
	}

	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
					&cpuset_current_mems_allowed :
					&node_online_map;
	return allowednodes;
}

/*
 * Given 'z' scanning a zonelist, run a couple of quick checks to see
 * if it is worth looking at further for free memory:
 *  1) Check that the zone isn't thought to be full (doesn't have its
 *     bit set in the zonelist_cache fullzones BITMAP).
 *  2) Check that the zones node (obtained from the zonelist_cache
 *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
 * Return true (non-zero) if zone is worth looking at further, or
 * else return false (zero) if it is not.
 *
 * This check -ignores- the distinction between various watermarks,
 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
 * found to be full for any variation of these watermarks, it will
 * be considered full for up to one second by all requests, unless
 * we are so low on memory on all allowed nodes that we are forced
 * into the second scan of the zonelist.
 *
 * In the second scan we ignore this zonelist cache and exactly
 * apply the watermarks to all zones, even if it is slower to do so.
 * We are low on memory in the second scan, and should leave no stone
 * unturned looking for a free page.
 */
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
						nodemask_t *allowednodes)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	int i;				/* index of *z in zonelist zones */
	int n;				/* node that zone *z is on */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)
		return 1;

	i = z - zonelist->zones;
	n = zlc->z_to_n[i];

	/* This zone is worth trying if it is allowed but not full */
	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
}

/*
 * Given 'z' scanning a zonelist, set the corresponding bit in
 * zlc->fullzones, so that subsequent attempts to allocate a page
 * from that zone don't waste time re-examining it.
 */
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	int i;				/* index of *z in zonelist zones */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)
		return;

	i = z - zonelist->zones;

	set_bit(i, zlc->fullzones);
}

#else	/* CONFIG_NUMA */

static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
	return NULL;
}

static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
				nodemask_t *allowednodes)
{
	return 1;
}

static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
{
}
#endif	/* CONFIG_NUMA */

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist, int alloc_flags)
{
	struct zone **z;
	struct page *page = NULL;
	int classzone_idx = zone_idx(zonelist->zones[0]);
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	z = zonelist->zones;

	do {
		if (NUMA_BUILD && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;
		zone = *z;
		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
				break;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
				goto try_next_zone;

		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
			unsigned long mark;
			if (alloc_flags & ALLOC_WMARK_MIN)
				mark = zone->pages_min;
			else if (alloc_flags & ALLOC_WMARK_LOW)
				mark = zone->pages_low;
			else
				mark = zone->pages_high;
			if (!zone_watermark_ok(zone, order, mark,
				    classzone_idx, alloc_flags)) {
				if (!zone_reclaim_mode ||
				    !zone_reclaim(zone, gfp_mask, order))
					goto this_zone_full;
			}
		}

		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
		if (page)
			break;
this_zone_full:
		if (NUMA_BUILD)
			zlc_mark_zone_full(zonelist, z);
try_next_zone:
		if (NUMA_BUILD && !did_zlc_setup) {
			/* we do zlc_setup after the first zone is tried */
			allowednodes = zlc_setup(zonelist, alloc_flags);
			zlc_active = 1;
			did_zlc_setup = 1;
		}
	} while (*(++z) != NULL);

	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}
	return page;
}

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page * fastcall
__alloc_pages(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	const gfp_t wait = gfp_mask & __GFP_WAIT;
	struct zone **z;
	struct page *page;
	struct reclaim_state reclaim_state;
	struct task_struct *p = current;
	int do_retry;
	int alloc_flags;
	int did_some_progress;

	might_sleep_if(wait);

	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

restart:
	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */

	if (unlikely(*z == NULL)) {
		/* Should this ever happen?? */
		return NULL;
	}

	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
	if (page)
		goto got_pg;

	/*
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
	 * using a larger set of nodes after it has established that the
	 * allowed per node queues are empty and that nodes are
	 * over allocated.
	 */
	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
		goto nopage;

	for (z = zonelist->zones; *z; z++)
		wakeup_kswapd(*z, order);

	/*
	 * OK, we're below the kswapd watermark and have kicked background
	 * reclaim. Now things get more complex, so set up alloc_flags according
	 * to how we want to proceed.
	 *
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has realtime scheduling
	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
	 */
	alloc_flags = ALLOC_WMARK_MIN;
	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
		alloc_flags |= ALLOC_HARDER;
	if (gfp_mask & __GFP_HIGH)
		alloc_flags |= ALLOC_HIGH;
	if (wait)
		alloc_flags |= ALLOC_CPUSET;

	/*
	 * Go through the zonelist again. Let __GFP_HIGH and allocations
	 * coming from realtime tasks go deeper into reserves.
	 *
	 * This is the last chance, in general, before the goto nopage.
	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
	if (page)
		goto got_pg;

	/* This allocation should allow future memory freeing. */

rebalance:
	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
			&& !in_interrupt()) {
		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
			/* go through the zonelist yet again, ignoring mins */
			page = get_page_from_freelist(gfp_mask, order,
				zonelist, ALLOC_NO_WATERMARKS);
			if (page)
				goto got_pg;
			if (gfp_mask & __GFP_NOFAIL) {
				congestion_wait(WRITE, HZ/50);
				goto nofail_alloc;
			}
		}
		goto nopage;
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

	cond_resched();

	/* We now go into synchronous reclaim */
	cpuset_memory_pressure_bump();
	p->flags |= PF_MEMALLOC;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);

	p->reclaim_state = NULL;
	p->flags &= ~PF_MEMALLOC;

	cond_resched();

	if (likely(did_some_progress)) {
		page = get_page_from_freelist(gfp_mask, order,
						zonelist, alloc_flags);
		if (page)
			goto got_pg;
	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 */
		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
		if (page)
			goto got_pg;

		out_of_memory(zonelist, gfp_mask, order);
		goto restart;
	}

	/*
	 * Don't let big-order allocations loop unless the caller explicitly
	 * requests that.  Wait for some write requests to complete then retry.
	 *
	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
	 * <= 3, but that may not be true in other implementations.
	 */
	do_retry = 0;
	if (!(gfp_mask & __GFP_NORETRY)) {
		if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
						(gfp_mask & __GFP_REPEAT))
			do_retry = 1;
		if (gfp_mask & __GFP_NOFAIL)
			do_retry = 1;
	}
	if (do_retry) {
		congestion_wait(WRITE, HZ/50);
		goto rebalance;
	}

nopage:
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		printk(KERN_WARNING "%s: page allocation failure."
			" order:%d, mode:0x%x\n",
			p->comm, order, gfp_mask);
		dump_stack();
		show_mem();
	}
got_pg:
	return page;
}

EXPORT_SYMBOL(__alloc_pages);

/*
 * Common helper functions.
 */
fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page * page;
	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

EXPORT_SYMBOL(__get_free_pages);

fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
{
	struct page * page;

	/*
	 * get_zeroed_page() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
	if (page)
		return (unsigned long) page_address(page);
	return 0;
}

EXPORT_SYMBOL(get_zeroed_page);

void __pagevec_free(struct pagevec *pvec)
{
	int i = pagevec_count(pvec);

	while (--i >= 0)
		free_hot_cold_page(pvec->pages[i], pvec->cold);
}

fastcall void __free_pages(struct page *page, unsigned int order)
{
	if (put_page_testzero(page)) {
		if (order == 0)
			free_hot_page(page);
		else
			__free_pages_ok(page, order);
	}
}

EXPORT_SYMBOL(__free_pages);

fastcall void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		VM_BUG_ON(!virt_addr_valid((void *)addr));
		__free_pages(virt_to_page((void *)addr), order);
	}
}

EXPORT_SYMBOL(free_pages);

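/*
 * Illustrative use of the helpers above (a sketch, not code from this
 * file): a caller that needs a two-page, physically contiguous scratch
 * buffer mapped in lowmem might do
 *
 *	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	free_pages(buf, 1);
 *
 * where the order passed to free_pages() must match the allocation.
 * get_zeroed_page(GFP_KERNEL) is the order-0, pre-zeroed variant.
 * Callers that want the struct page itself use alloc_pages() and
 * __free_pages() instead, which also accept __GFP_HIGHMEM.
 */
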
static unsigned int nr_free_zone_pages(int offset)
{
	/* Just pick one node, since fallback list is circular */
	pg_data_t *pgdat = NODE_DATA(numa_node_id());
	unsigned int sum = 0;

	struct zonelist *zonelist = pgdat->node_zonelists + offset;
	struct zone **zonep = zonelist->zones;
	struct zone *zone;

	for (zone = *zonep++; zone; zone = *zonep++) {
		unsigned long size = zone->present_pages;
		unsigned long high = zone->pages_high;
		if (size > high)
			sum += size - high;
	}

	return sum;
}

/*
 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 */
unsigned int nr_free_buffer_pages(void)
{
	return nr_free_zone_pages(gfp_zone(GFP_USER));
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);

/*
 * Amount of free RAM allocatable within all zones
 */
unsigned int nr_free_pagecache_pages(void)
{
	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}

static inline void show_node(struct zone *zone)
{
	if (NUMA_BUILD)
		printk("Node %d ", zone_to_nid(zone));
}

void si_meminfo(struct sysinfo *val)
{
	val->totalram = totalram_pages;
	val->sharedram = 0;
	val->freeram = global_page_state(NR_FREE_PAGES);
	val->bufferram = nr_blockdev_pages();
	val->totalhigh = totalhigh_pages;
	val->freehigh = nr_free_highpages();
	val->mem_unit = PAGE_SIZE;
}

EXPORT_SYMBOL(si_meminfo);

#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	val->totalram = pgdat->node_present_pages;
	val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
			NR_FREE_PAGES);
#else
	val->totalhigh = 0;
	val->freehigh = 0;
#endif
	val->mem_unit = PAGE_SIZE;
}
#endif

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
	int cpu;
	struct zone *zone;

	for_each_zone(zone) {
		if (!populated_zone(zone))
			continue;

		show_node(zone);
		printk("%s per-cpu:\n", zone->name);

		for_each_online_cpu(cpu) {
			struct per_cpu_pageset *pageset;

			pageset = zone_pcp(zone, cpu);

			printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d   "
			       "Cold: hi:%5d, btch:%4d usd:%4d\n",
			       cpu, pageset->pcp[0].high,
			       pageset->pcp[0].batch, pageset->pcp[0].count,
			       pageset->pcp[1].high, pageset->pcp[1].batch,
			       pageset->pcp[1].count);
		}
	}

	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
		" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
		global_page_state(NR_ACTIVE),
		global_page_state(NR_INACTIVE),
		global_page_state(NR_FILE_DIRTY),
		global_page_state(NR_WRITEBACK),
		global_page_state(NR_UNSTABLE_NFS),
		global_page_state(NR_FREE_PAGES),
		global_page_state(NR_SLAB_RECLAIMABLE) +
			global_page_state(NR_SLAB_UNRECLAIMABLE),
		global_page_state(NR_FILE_MAPPED),
		global_page_state(NR_PAGETABLE),
		global_page_state(NR_BOUNCE));

	for_each_zone(zone) {
		int i;

		if (!populated_zone(zone))
			continue;

		show_node(zone);
		printk("%s"
			" free:%lukB"
			" min:%lukB"
			" low:%lukB"
			" high:%lukB"
			" active:%lukB"
			" inactive:%lukB"
			" present:%lukB"
			" pages_scanned:%lu"
			" all_unreclaimable? %s"
			"\n",
			zone->name,
			K(zone_page_state(zone, NR_FREE_PAGES)),
			K(zone->pages_min),
			K(zone->pages_low),
			K(zone->pages_high),
			K(zone_page_state(zone, NR_ACTIVE)),
			K(zone_page_state(zone, NR_INACTIVE)),
			K(zone->present_pages),
			zone->pages_scanned,
			(zone->all_unreclaimable ? "yes" : "no")
			);
		printk("lowmem_reserve[]:");
		for (i = 0; i < MAX_NR_ZONES; i++)
			printk(" %lu", zone->lowmem_reserve[i]);
		printk("\n");
	}

	for_each_zone(zone) {
 		unsigned long nr[MAX_ORDER], flags, order, total = 0;

		if (!populated_zone(zone))
			continue;

		show_node(zone);
		printk("%s: ", zone->name);

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			nr[order] = zone->free_area[order].nr_free;
			total += nr[order] << order;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++)
			printk("%lu*%lukB ", nr[order], K(1UL) << order);
		printk("= %lukB\n", K(total));
	}

	show_swap_cache_info();
}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
				int nr_zones, enum zone_type zone_type)
{
	struct zone *zone;

	BUG_ON(zone_type >= MAX_NR_ZONES);
	zone_type++;

	do {
		zone_type--;
		zone = pgdat->node_zones + zone_type;
		if (populated_zone(zone)) {
			zonelist->zones[nr_zones++] = zone;
			check_highest_zone(zone_type);
		}

	} while (zone_type);
	return nr_zones;
}


/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 *  the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT  0
#define ZONELIST_ORDER_NODE     1
#define ZONELIST_ORDER_ZONE     2

/* zonelist order in the kernel.
 * set_zonelist_order() will set this to NODE or ZONE.
 */
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};


#ifdef CONFIG_NUMA
/* The ordering the user specified, changeable via boot option or sysctl */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN	16
char numa_zonelist_order[16] = "default";

/*
 * interface to configure zonelist ordering.
 * command line option "numa_zonelist_order"
 *	= "[dD]efault	- default, automatic configuration.
 *	= "[nN]ode	- order by node locality, then by zone within node
 *	= "[zZ]one	- order by zone, then by locality within zone
 */

static int __parse_numa_zonelist_order(char *s)
{
	if (*s == 'd' || *s == 'D') {
		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
	} else if (*s == 'n' || *s == 'N') {
		user_zonelist_order = ZONELIST_ORDER_NODE;
	} else if (*s == 'z' || *s == 'Z') {
		user_zonelist_order = ZONELIST_ORDER_ZONE;
	} else {
		printk(KERN_WARNING
			"Ignoring invalid numa_zonelist_order value:  "
			"%s\n", s);
		return -EINVAL;
	}
	return 0;
}

static __init int setup_numa_zonelist_order(char *s)
{
	if (s)
		return __parse_numa_zonelist_order(s);
	return 0;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);

/*
 * sysctl handler for numa_zonelist_order
 */
int numa_zonelist_order_handler(ctl_table *table, int write,
		struct file *file, void __user *buffer, size_t *length,
		loff_t *ppos)
{
	char saved_string[NUMA_ZONELIST_ORDER_LEN];
	int ret;

	if (write)
		strncpy(saved_string, (char*)table->data,
			NUMA_ZONELIST_ORDER_LEN);
	ret = proc_dostring(table, write, file, buffer, length, ppos);
	if (ret)
		return ret;
	if (write) {
		int oldval = user_zonelist_order;
		if (__parse_numa_zonelist_order((char*)table->data)) {
			/*
			 * bogus value.  restore saved string
			 */
			strncpy((char*)table->data, saved_string,
				NUMA_ZONELIST_ORDER_LEN);
			user_zonelist_order = oldval;
		} else if (oldval != user_zonelist_order)
			build_all_zonelists();
	}
	return 0;
}

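/*
 * For illustration: with the handlers above wired up, the ordering can
 * be chosen at boot with "numa_zonelist_order=z" (or =n, =d) and, on a
 * running system, through the corresponding sysctl, conventionally
 * exposed as /proc/sys/vm/numa_zonelist_order, e.g.
 *
 *	echo zone > /proc/sys/vm/numa_zonelist_order
 *
 * A successful write reparses the string and, when the value actually
 * changes, rebuilds every node's zonelists via build_all_zonelists().
 */
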
1774 */ 1775 static int find_next_best_node(int node, nodemask_t *used_node_mask) 1776 { 1777 int n, val; 1778 int min_val = INT_MAX; 1779 int best_node = -1; 1780 1781 /* Use the local node if we haven't already */ 1782 if (!node_isset(node, *used_node_mask)) { 1783 node_set(node, *used_node_mask); 1784 return node; 1785 } 1786 1787 for_each_online_node(n) { 1788 cpumask_t tmp; 1789 1790 /* Don't want a node to appear more than once */ 1791 if (node_isset(n, *used_node_mask)) 1792 continue; 1793 1794 /* Use the distance array to find the distance */ 1795 val = node_distance(node, n); 1796 1797 /* Penalize nodes under us ("prefer the next node") */ 1798 val += (n < node); 1799 1800 /* Give preference to headless and unused nodes */ 1801 tmp = node_to_cpumask(n); 1802 if (!cpus_empty(tmp)) 1803 val += PENALTY_FOR_NODE_WITH_CPUS; 1804 1805 /* Slight preference for less loaded node */ 1806 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1807 val += node_load[n]; 1808 1809 if (val < min_val) { 1810 min_val = val; 1811 best_node = n; 1812 } 1813 } 1814 1815 if (best_node >= 0) 1816 node_set(best_node, *used_node_mask); 1817 1818 return best_node; 1819 } 1820 1821 1822 /* 1823 * Build zonelists ordered by node and zones within node. 1824 * This results in maximum locality--normal zone overflows into local 1825 * DMA zone, if any--but risks exhausting DMA zone. 1826 */ 1827 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 1828 { 1829 enum zone_type i; 1830 int j; 1831 struct zonelist *zonelist; 1832 1833 for (i = 0; i < MAX_NR_ZONES; i++) { 1834 zonelist = pgdat->node_zonelists + i; 1835 for (j = 0; zonelist->zones[j] != NULL; j++) 1836 ; 1837 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1838 zonelist->zones[j] = NULL; 1839 } 1840 } 1841 1842 /* 1843 * Build zonelists ordered by zone and nodes within zones. 1844 * This results in conserving DMA zone[s] until all Normal memory is 1845 * exhausted, but results in overflowing to remote node while memory 1846 * may still exist in local DMA zone. 1847 */ 1848 static int node_order[MAX_NUMNODES]; 1849 1850 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 1851 { 1852 enum zone_type i; 1853 int pos, j, node; 1854 int zone_type; /* needs to be signed */ 1855 struct zone *z; 1856 struct zonelist *zonelist; 1857 1858 for (i = 0; i < MAX_NR_ZONES; i++) { 1859 zonelist = pgdat->node_zonelists + i; 1860 pos = 0; 1861 for (zone_type = i; zone_type >= 0; zone_type--) { 1862 for (j = 0; j < nr_nodes; j++) { 1863 node = node_order[j]; 1864 z = &NODE_DATA(node)->node_zones[zone_type]; 1865 if (populated_zone(z)) { 1866 zonelist->zones[pos++] = z; 1867 check_highest_zone(zone_type); 1868 } 1869 } 1870 } 1871 zonelist->zones[pos] = NULL; 1872 } 1873 } 1874 1875 static int default_zonelist_order(void) 1876 { 1877 int nid, zone_type; 1878 unsigned long low_kmem_size,total_size; 1879 struct zone *z; 1880 int average_size; 1881 /* 1882 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. 1883 * If they are really small and used heavily, the system can fall 1884 * into OOM very easily. 1885 * This function detect ZONE_DMA/DMA32 size and confgigures zone order. 1886 */ 1887 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) 
static int default_zonelist_order(void)
{
	int nid, zone_type;
	unsigned long low_kmem_size,total_size;
	struct zone *z;
	int average_size;
	/*
	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
	 * If they are really small and used heavily, the system can fall
	 * into OOM very easily.
	 * This function detects ZONE_DMA/DMA32 size and configures zone order.
	 */
	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
	low_kmem_size = 0;
	total_size = 0;
	for_each_online_node(nid) {
		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
			z = &NODE_DATA(nid)->node_zones[zone_type];
			if (populated_zone(z)) {
				if (zone_type < ZONE_NORMAL)
					low_kmem_size += z->present_pages;
				total_size += z->present_pages;
			}
		}
	}
	if (!low_kmem_size ||  /* there are no DMA area. */
	    low_kmem_size > total_size/2)  /* DMA/DMA32 is big. */
		return ZONELIST_ORDER_NODE;
	/*
	 * look into each node's config.
	 * If there is a node whose DMA/DMA32 memory is very big area on
	 * local memory, NODE_ORDER may be suitable.
	 */
	average_size = total_size / (num_online_nodes() + 1);
	for_each_online_node(nid) {
		low_kmem_size = 0;
		total_size = 0;
		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
			z = &NODE_DATA(nid)->node_zones[zone_type];
			if (populated_zone(z)) {
				if (zone_type < ZONE_NORMAL)
					low_kmem_size += z->present_pages;
				total_size += z->present_pages;
			}
		}
		if (low_kmem_size &&
		    total_size > average_size && /* ignore small node */
		    low_kmem_size > total_size * 70/100)
			return ZONELIST_ORDER_NODE;
	}
	return ZONELIST_ORDER_ZONE;
}

static void set_zonelist_order(void)
{
	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
		current_zonelist_order = default_zonelist_order();
	else
		current_zonelist_order = user_zonelist_order;
}

static void build_zonelists(pg_data_t *pgdat)
{
	int j, node, load;
	enum zone_type i;
	nodemask_t used_mask;
	int local_node, prev_node;
	struct zonelist *zonelist;
	int order = current_zonelist_order;

	/* initialize zonelists */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zonelist = pgdat->node_zonelists + i;
		zonelist->zones[0] = NULL;
	}

	/* NUMA-aware ordering of nodes */
	local_node = pgdat->node_id;
	load = num_online_nodes();
	prev_node = local_node;
	nodes_clear(used_mask);

	memset(node_load, 0, sizeof(node_load));
	memset(node_order, 0, sizeof(node_order));
	j = 0;

	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
		int distance = node_distance(local_node, node);

		/*
		 * If another node is sufficiently far away then it is better
		 * to reclaim pages in a zone before going off node.
		 */
		if (distance > RECLAIM_DISTANCE)
			zone_reclaim_mode = 1;

		/*
		 * We don't want to pressure a particular node.
		 * So adding penalty to the first node in same
		 * distance group to make it round-robin.
		 */
		if (distance != node_distance(local_node, prev_node))
			node_load[node] = load;

		prev_node = node;
		load--;
		if (order == ZONELIST_ORDER_NODE)
			build_zonelists_in_node_order(pgdat, node);
		else
			node_order[j++] = node;	/* remember order */
	}

	if (order == ZONELIST_ORDER_ZONE) {
		/* calculate node order -- i.e., DMA last! */
*/ 1989 build_zonelists_in_zone_order(pgdat, j); 1990 } 1991 } 1992 1993 /* Construct the zonelist performance cache - see further mmzone.h */ 1994 static void build_zonelist_cache(pg_data_t *pgdat) 1995 { 1996 int i; 1997 1998 for (i = 0; i < MAX_NR_ZONES; i++) { 1999 struct zonelist *zonelist; 2000 struct zonelist_cache *zlc; 2001 struct zone **z; 2002 2003 zonelist = pgdat->node_zonelists + i; 2004 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 2005 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 2006 for (z = zonelist->zones; *z; z++) 2007 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 2008 } 2009 } 2010 2011 2012 #else /* CONFIG_NUMA */ 2013 2014 static void set_zonelist_order(void) 2015 { 2016 current_zonelist_order = ZONELIST_ORDER_ZONE; 2017 } 2018 2019 static void build_zonelists(pg_data_t *pgdat) 2020 { 2021 int node, local_node; 2022 enum zone_type i,j; 2023 2024 local_node = pgdat->node_id; 2025 for (i = 0; i < MAX_NR_ZONES; i++) { 2026 struct zonelist *zonelist; 2027 2028 zonelist = pgdat->node_zonelists + i; 2029 2030 j = build_zonelists_node(pgdat, zonelist, 0, i); 2031 /* 2032 * Now we build the zonelist so that it contains the zones 2033 * of all the other nodes. 2034 * We don't want to pressure a particular node, so when 2035 * building the zones for node N, we make sure that the 2036 * zones coming right after the local ones are those from 2037 * node N+1 (modulo N) 2038 */ 2039 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 2040 if (!node_online(node)) 2041 continue; 2042 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2043 } 2044 for (node = 0; node < local_node; node++) { 2045 if (!node_online(node)) 2046 continue; 2047 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2048 } 2049 2050 zonelist->zones[j] = NULL; 2051 } 2052 } 2053 2054 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2055 static void build_zonelist_cache(pg_data_t *pgdat) 2056 { 2057 int i; 2058 2059 for (i = 0; i < MAX_NR_ZONES; i++) 2060 pgdat->node_zonelists[i].zlcache_ptr = NULL; 2061 } 2062 2063 #endif /* CONFIG_NUMA */ 2064 2065 /* return values int ....just for stop_machine_run() */ 2066 static int __build_all_zonelists(void *dummy) 2067 { 2068 int nid; 2069 2070 for_each_online_node(nid) { 2071 build_zonelists(NODE_DATA(nid)); 2072 build_zonelist_cache(NODE_DATA(nid)); 2073 } 2074 return 0; 2075 } 2076 2077 void build_all_zonelists(void) 2078 { 2079 set_zonelist_order(); 2080 2081 if (system_state == SYSTEM_BOOTING) { 2082 __build_all_zonelists(NULL); 2083 cpuset_init_current_mems_allowed(); 2084 } else { 2085 /* we have to stop all cpus to guaranntee there is no user 2086 of zonelist */ 2087 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2088 /* cpuset refresh routine should be here */ 2089 } 2090 vm_total_pages = nr_free_pagecache_pages(); 2091 printk("Built %i zonelists in %s order. Total pages: %ld\n", 2092 num_online_nodes(), 2093 zonelist_order_name[current_zonelist_order], 2094 vm_total_pages); 2095 #ifdef CONFIG_NUMA 2096 printk("Policy zone: %s\n", zone_names[policy_zone]); 2097 #endif 2098 } 2099 2100 /* 2101 * Helper functions to size the waitqueue hash table. 2102 * Essentially these want to choose hash table sizes sufficiently 2103 * large so that collisions trying to wait on pages are rare. 2104 * But in fact, the number of active page waitqueues on typical 2105 * systems is ridiculously low, less than 200. So this is even 2106 * conservative, even though it seems large. 
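 *
 * As a rough worked example (sizes invented): with 4K pages, a 1GB
 * zone spans 262144 pages, so wait_table_hash_nr_entries() below asks
 * for 262144 / PAGES_PER_WAITQUEUE = 1024 entries, rounds that up to a
 * power of two (already 1024) and clamps the result to the range
 * [4, 4096].  With memory hot-plug configured, the maximum of 4096
 * entries is simply used unconditionally.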
2107 * 2108 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 2109 * waitqueues, i.e. the size of the waitq table given the number of pages. 2110 */ 2111 #define PAGES_PER_WAITQUEUE 256 2112 2113 #ifndef CONFIG_MEMORY_HOTPLUG 2114 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2115 { 2116 unsigned long size = 1; 2117 2118 pages /= PAGES_PER_WAITQUEUE; 2119 2120 while (size < pages) 2121 size <<= 1; 2122 2123 /* 2124 * Once we have dozens or even hundreds of threads sleeping 2125 * on IO we've got bigger problems than wait queue collision. 2126 * Limit the size of the wait table to a reasonable size. 2127 */ 2128 size = min(size, 4096UL); 2129 2130 return max(size, 4UL); 2131 } 2132 #else 2133 /* 2134 * A zone's size might be changed by hot-add, so it is not possible to determine 2135 * a suitable size for its wait_table. So we use the maximum size now. 2136 * 2137 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 2138 * 2139 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 2140 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 2141 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 2142 * 2143 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 2144 * or more by the traditional way. (See above). It equals: 2145 * 2146 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 2147 * ia64(16K page size) : = ( 8G + 4M)byte. 2148 * powerpc (64K page size) : = (32G +16M)byte. 2149 */ 2150 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2151 { 2152 return 4096UL; 2153 } 2154 #endif 2155 2156 /* 2157 * This is an integer logarithm so that shifts can be used later 2158 * to extract the more random high bits from the multiplicative 2159 * hash function before the remainder is taken. 2160 */ 2161 static inline unsigned long wait_table_bits(unsigned long size) 2162 { 2163 return ffz(~size); 2164 } 2165 2166 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2167 2168 /* 2169 * Initially all pages are reserved - free ones are freed 2170 * up by free_all_bootmem() once the early boot process is 2171 * done. Non-atomic initialization, single-pass. 2172 */ 2173 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 2174 unsigned long start_pfn, enum memmap_context context) 2175 { 2176 struct page *page; 2177 unsigned long end_pfn = start_pfn + size; 2178 unsigned long pfn; 2179 2180 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2181 /* 2182 * There can be holes in boot-time mem_map[]s 2183 * handed to this function. They do not 2184 * exist on hotplugged memory. 2185 */ 2186 if (context == MEMMAP_EARLY) { 2187 if (!early_pfn_valid(pfn)) 2188 continue; 2189 if (!early_pfn_in_nid(pfn, nid)) 2190 continue; 2191 } 2192 page = pfn_to_page(pfn); 2193 set_page_links(page, zone, nid, pfn); 2194 init_page_count(page); 2195 reset_page_mapcount(page); 2196 SetPageReserved(page); 2197 INIT_LIST_HEAD(&page->lru); 2198 #ifdef WANT_PAGE_VIRTUAL 2199 /* The shift won't overflow because ZONE_NORMAL is below 4G. 
*/ 2200 if (!is_highmem_idx(zone)) 2201 set_page_address(page, __va(pfn << PAGE_SHIFT)); 2202 #endif 2203 } 2204 } 2205 2206 static void __meminit zone_init_free_lists(struct pglist_data *pgdat, 2207 struct zone *zone, unsigned long size) 2208 { 2209 int order; 2210 for (order = 0; order < MAX_ORDER ; order++) { 2211 INIT_LIST_HEAD(&zone->free_area[order].free_list); 2212 zone->free_area[order].nr_free = 0; 2213 } 2214 } 2215 2216 #ifndef __HAVE_ARCH_MEMMAP_INIT 2217 #define memmap_init(size, nid, zone, start_pfn) \ 2218 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 2219 #endif 2220 2221 static int __devinit zone_batchsize(struct zone *zone) 2222 { 2223 int batch; 2224 2225 /* 2226 * The per-cpu-pages pools are set to around 1000th of the 2227 * size of the zone. But no more than 1/2 of a meg. 2228 * 2229 * OK, so we don't know how big the cache is. So guess. 2230 */ 2231 batch = zone->present_pages / 1024; 2232 if (batch * PAGE_SIZE > 512 * 1024) 2233 batch = (512 * 1024) / PAGE_SIZE; 2234 batch /= 4; /* We effectively *= 4 below */ 2235 if (batch < 1) 2236 batch = 1; 2237 2238 /* 2239 * Clamp the batch to a 2^n - 1 value. Having a power 2240 * of 2 value was found to be more likely to have 2241 * suboptimal cache aliasing properties in some cases. 2242 * 2243 * For example if 2 tasks are alternately allocating 2244 * batches of pages, one task can end up with a lot 2245 * of pages of one half of the possible page colors 2246 * and the other with pages of the other colors. 2247 */ 2248 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2249 2250 return batch; 2251 } 2252 2253 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2254 { 2255 struct per_cpu_pages *pcp; 2256 2257 memset(p, 0, sizeof(*p)); 2258 2259 pcp = &p->pcp[0]; /* hot */ 2260 pcp->count = 0; 2261 pcp->high = 6 * batch; 2262 pcp->batch = max(1UL, 1 * batch); 2263 INIT_LIST_HEAD(&pcp->list); 2264 2265 pcp = &p->pcp[1]; /* cold*/ 2266 pcp->count = 0; 2267 pcp->high = 2 * batch; 2268 pcp->batch = max(1UL, batch/2); 2269 INIT_LIST_HEAD(&pcp->list); 2270 } 2271 2272 /* 2273 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 2274 * to the value high for the pageset p. 2275 */ 2276 2277 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 2278 unsigned long high) 2279 { 2280 struct per_cpu_pages *pcp; 2281 2282 pcp = &p->pcp[0]; /* hot list */ 2283 pcp->high = high; 2284 pcp->batch = max(1UL, high/4); 2285 if ((high/4) > (PAGE_SHIFT * 8)) 2286 pcp->batch = PAGE_SHIFT * 8; 2287 } 2288 2289 2290 #ifdef CONFIG_NUMA 2291 /* 2292 * Boot pageset table. One per cpu which is going to be used for all 2293 * zones and all nodes. The parameters will be set in such a way 2294 * that an item put on a list will immediately be handed over to 2295 * the buddy list. This is safe since pageset manipulation is done 2296 * with interrupts disabled. 2297 * 2298 * Some NUMA counter updates may also be caught by the boot pagesets. 2299 * 2300 * The boot_pagesets must be kept even after bootup is complete for 2301 * unused processors and/or zones. They do play a role for bootstrapping 2302 * hotplugged processors. 2303 * 2304 * zoneinfo_show() and maybe other functions do 2305 * not check if the processor is online before following the pageset pointer. 2306 * Other parts of the kernel may not check if the zone is available. 
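 *
 * For reference (the figures follow from setup_pageset() above when it
 * is passed batch == 0): zone_pcp_init() below initialises these with
 * setup_pageset(&boot_pageset[cpu], 0), which leaves pcp->high at 0
 * and pcp->batch at 1 for both the hot and cold lists, and that is
 * what makes the hand-over to the buddy list immediate as described
 * above.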
2307 */ 2308 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 2309 2310 /* 2311 * Dynamically allocate memory for the 2312 * per cpu pageset array in struct zone. 2313 */ 2314 static int __cpuinit process_zones(int cpu) 2315 { 2316 struct zone *zone, *dzone; 2317 2318 for_each_zone(zone) { 2319 2320 if (!populated_zone(zone)) 2321 continue; 2322 2323 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2324 GFP_KERNEL, cpu_to_node(cpu)); 2325 if (!zone_pcp(zone, cpu)) 2326 goto bad; 2327 2328 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2329 2330 if (percpu_pagelist_fraction) 2331 setup_pagelist_highmark(zone_pcp(zone, cpu), 2332 (zone->present_pages / percpu_pagelist_fraction)); 2333 } 2334 2335 return 0; 2336 bad: 2337 for_each_zone(dzone) { 2338 if (dzone == zone) 2339 break; 2340 kfree(zone_pcp(dzone, cpu)); 2341 zone_pcp(dzone, cpu) = NULL; 2342 } 2343 return -ENOMEM; 2344 } 2345 2346 static inline void free_zone_pagesets(int cpu) 2347 { 2348 struct zone *zone; 2349 2350 for_each_zone(zone) { 2351 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2352 2353 /* Free per_cpu_pageset if it is slab allocated */ 2354 if (pset != &boot_pageset[cpu]) 2355 kfree(pset); 2356 zone_pcp(zone, cpu) = NULL; 2357 } 2358 } 2359 2360 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2361 unsigned long action, 2362 void *hcpu) 2363 { 2364 int cpu = (long)hcpu; 2365 int ret = NOTIFY_OK; 2366 2367 switch (action) { 2368 case CPU_UP_PREPARE: 2369 case CPU_UP_PREPARE_FROZEN: 2370 if (process_zones(cpu)) 2371 ret = NOTIFY_BAD; 2372 break; 2373 case CPU_UP_CANCELED: 2374 case CPU_UP_CANCELED_FROZEN: 2375 case CPU_DEAD: 2376 case CPU_DEAD_FROZEN: 2377 free_zone_pagesets(cpu); 2378 break; 2379 default: 2380 break; 2381 } 2382 return ret; 2383 } 2384 2385 static struct notifier_block __cpuinitdata pageset_notifier = 2386 { &pageset_cpuup_callback, NULL, 0 }; 2387 2388 void __init setup_per_cpu_pageset(void) 2389 { 2390 int err; 2391 2392 /* Initialize per_cpu_pageset for cpu 0. 2393 * A cpuup callback will do this for every cpu 2394 * as it comes online 2395 */ 2396 err = process_zones(smp_processor_id()); 2397 BUG_ON(err); 2398 register_cpu_notifier(&pageset_notifier); 2399 } 2400 2401 #endif 2402 2403 static noinline __init_refok 2404 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2405 { 2406 int i; 2407 struct pglist_data *pgdat = zone->zone_pgdat; 2408 size_t alloc_size; 2409 2410 /* 2411 * The per-page waitqueue mechanism uses hashed waitqueues 2412 * per zone. 2413 */ 2414 zone->wait_table_hash_nr_entries = 2415 wait_table_hash_nr_entries(zone_size_pages); 2416 zone->wait_table_bits = 2417 wait_table_bits(zone->wait_table_hash_nr_entries); 2418 alloc_size = zone->wait_table_hash_nr_entries 2419 * sizeof(wait_queue_head_t); 2420 2421 if (system_state == SYSTEM_BOOTING) { 2422 zone->wait_table = (wait_queue_head_t *) 2423 alloc_bootmem_node(pgdat, alloc_size); 2424 } else { 2425 /* 2426 * This case means that a zone whose size was 0 gets new memory 2427 * via memory hot-add. 2428 * But it may be the case that a new node was hot-added. In 2429 * this case vmalloc() will not be able to use this new node's 2430 * memory - this wait_table must be initialized to use this new 2431 * node itself as well. 2432 * To use this new node's memory, further consideration will be 2433 * necessary. 
2434 */ 2435 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); 2436 } 2437 if (!zone->wait_table) 2438 return -ENOMEM; 2439 2440 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2441 init_waitqueue_head(zone->wait_table + i); 2442 2443 return 0; 2444 } 2445 2446 static __meminit void zone_pcp_init(struct zone *zone) 2447 { 2448 int cpu; 2449 unsigned long batch = zone_batchsize(zone); 2450 2451 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2452 #ifdef CONFIG_NUMA 2453 /* Early boot. Slab allocator not functional yet */ 2454 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2455 setup_pageset(&boot_pageset[cpu],0); 2456 #else 2457 setup_pageset(zone_pcp(zone,cpu), batch); 2458 #endif 2459 } 2460 if (zone->present_pages) 2461 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2462 zone->name, zone->present_pages, batch); 2463 } 2464 2465 __meminit int init_currently_empty_zone(struct zone *zone, 2466 unsigned long zone_start_pfn, 2467 unsigned long size, 2468 enum memmap_context context) 2469 { 2470 struct pglist_data *pgdat = zone->zone_pgdat; 2471 int ret; 2472 ret = zone_wait_table_init(zone, size); 2473 if (ret) 2474 return ret; 2475 pgdat->nr_zones = zone_idx(zone) + 1; 2476 2477 zone->zone_start_pfn = zone_start_pfn; 2478 2479 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2480 2481 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2482 2483 return 0; 2484 } 2485 2486 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2487 /* 2488 * Basic iterator support. Return the first range of PFNs for a node 2489 * Note: nid == MAX_NUMNODES returns first region regardless of node 2490 */ 2491 static int __meminit first_active_region_index_in_nid(int nid) 2492 { 2493 int i; 2494 2495 for (i = 0; i < nr_nodemap_entries; i++) 2496 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2497 return i; 2498 2499 return -1; 2500 } 2501 2502 /* 2503 * Basic iterator support. Return the next active range of PFNs for a node 2504 * Note: nid == MAX_NUMNODES returns next region regardles of node 2505 */ 2506 static int __meminit next_active_region_index_in_nid(int index, int nid) 2507 { 2508 for (index = index + 1; index < nr_nodemap_entries; index++) 2509 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2510 return index; 2511 2512 return -1; 2513 } 2514 2515 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2516 /* 2517 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2518 * Architectures may implement their own version but if add_active_range() 2519 * was used and there are no special requirements, this is a convenient 2520 * alternative 2521 */ 2522 int __meminit early_pfn_to_nid(unsigned long pfn) 2523 { 2524 int i; 2525 2526 for (i = 0; i < nr_nodemap_entries; i++) { 2527 unsigned long start_pfn = early_node_map[i].start_pfn; 2528 unsigned long end_pfn = early_node_map[i].end_pfn; 2529 2530 if (start_pfn <= pfn && pfn < end_pfn) 2531 return early_node_map[i].nid; 2532 } 2533 2534 return 0; 2535 } 2536 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2537 2538 /* Basic iterator support to walk early_node_map[] */ 2539 #define for_each_active_range_index_in_nid(i, nid) \ 2540 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2541 i = next_active_region_index_in_nid(i, nid)) 2542 2543 /** 2544 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2545 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
2546 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2547 * 2548 * If an architecture guarantees that all ranges registered with 2549 * add_active_ranges() contain no holes and may be freed, this 2550 * function may be used instead of calling free_bootmem() manually. 2551 */ 2552 void __init free_bootmem_with_active_regions(int nid, 2553 unsigned long max_low_pfn) 2554 { 2555 int i; 2556 2557 for_each_active_range_index_in_nid(i, nid) { 2558 unsigned long size_pages = 0; 2559 unsigned long end_pfn = early_node_map[i].end_pfn; 2560 2561 if (early_node_map[i].start_pfn >= max_low_pfn) 2562 continue; 2563 2564 if (end_pfn > max_low_pfn) 2565 end_pfn = max_low_pfn; 2566 2567 size_pages = end_pfn - early_node_map[i].start_pfn; 2568 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2569 PFN_PHYS(early_node_map[i].start_pfn), 2570 size_pages << PAGE_SHIFT); 2571 } 2572 } 2573 2574 /** 2575 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2576 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2577 * 2578 * If an architecture guarantees that all ranges registered with 2579 * add_active_ranges() contain no holes and may be freed, this 2580 * function may be used instead of calling memory_present() manually. 2581 */ 2582 void __init sparse_memory_present_with_active_regions(int nid) 2583 { 2584 int i; 2585 2586 for_each_active_range_index_in_nid(i, nid) 2587 memory_present(early_node_map[i].nid, 2588 early_node_map[i].start_pfn, 2589 early_node_map[i].end_pfn); 2590 } 2591 2592 /** 2593 * push_node_boundaries - Push node boundaries to at least the requested boundary 2594 * @nid: The nid of the node to push the boundary for 2595 * @start_pfn: The start pfn of the node 2596 * @end_pfn: The end pfn of the node 2597 * 2598 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd 2599 * time. Specifically, on x86_64, SRAT will report ranges that can potentially 2600 * be hotplugged even though no physical memory exists. This function allows 2601 * an arch to push out the node boundaries so mem_map is allocated that can 2602 * be used later.
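 *
 * Hypothetical example (the values are invented): if SRAT says node 1,
 * currently ending at 4GB, may later be populated up to 8GB, the arch
 * code could call, with 4K pages,
 *
 *	push_node_boundaries(1, 0x100000, 0x200000);
 *
 * so that the node's mem_map is sized to cover the reserved range too.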
2603 */ 2604 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2605 void __init push_node_boundaries(unsigned int nid, 2606 unsigned long start_pfn, unsigned long end_pfn) 2607 { 2608 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 2609 nid, start_pfn, end_pfn); 2610 2611 /* Initialise the boundary for this node if necessary */ 2612 if (node_boundary_end_pfn[nid] == 0) 2613 node_boundary_start_pfn[nid] = -1UL; 2614 2615 /* Update the boundaries */ 2616 if (node_boundary_start_pfn[nid] > start_pfn) 2617 node_boundary_start_pfn[nid] = start_pfn; 2618 if (node_boundary_end_pfn[nid] < end_pfn) 2619 node_boundary_end_pfn[nid] = end_pfn; 2620 } 2621 2622 /* If necessary, push the node boundary out for reserve hotadd */ 2623 static void __meminit account_node_boundary(unsigned int nid, 2624 unsigned long *start_pfn, unsigned long *end_pfn) 2625 { 2626 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 2627 nid, *start_pfn, *end_pfn); 2628 2629 /* Return if boundary information has not been provided */ 2630 if (node_boundary_end_pfn[nid] == 0) 2631 return; 2632 2633 /* Check the boundaries and update if necessary */ 2634 if (node_boundary_start_pfn[nid] < *start_pfn) 2635 *start_pfn = node_boundary_start_pfn[nid]; 2636 if (node_boundary_end_pfn[nid] > *end_pfn) 2637 *end_pfn = node_boundary_end_pfn[nid]; 2638 } 2639 #else 2640 void __init push_node_boundaries(unsigned int nid, 2641 unsigned long start_pfn, unsigned long end_pfn) {} 2642 2643 static void __meminit account_node_boundary(unsigned int nid, 2644 unsigned long *start_pfn, unsigned long *end_pfn) {} 2645 #endif 2646 2647 2648 /** 2649 * get_pfn_range_for_nid - Return the start and end page frames for a node 2650 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 2651 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 2652 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 2653 * 2654 * It returns the start and end page frame of a node based on information 2655 * provided by an arch calling add_active_range(). If called for a node 2656 * with no available memory, a warning is printed and the start and end 2657 * PFNs will be 0. 2658 */ 2659 void __meminit get_pfn_range_for_nid(unsigned int nid, 2660 unsigned long *start_pfn, unsigned long *end_pfn) 2661 { 2662 int i; 2663 *start_pfn = -1UL; 2664 *end_pfn = 0; 2665 2666 for_each_active_range_index_in_nid(i, nid) { 2667 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 2668 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 2669 } 2670 2671 if (*start_pfn == -1UL) { 2672 printk(KERN_WARNING "Node %u active with no memory\n", nid); 2673 *start_pfn = 0; 2674 } 2675 2676 /* Push the node boundaries out if requested */ 2677 account_node_boundary(nid, start_pfn, end_pfn); 2678 } 2679 2680 /* 2681 * This finds a zone that can be used for ZONE_MOVABLE pages. 
The 2682 * assumption is made that zones within a node are ordered in monotonically 2683 * increasing memory addresses so that the "highest" populated zone is used 2684 */ 2685 void __init find_usable_zone_for_movable(void) 2686 { 2687 int zone_index; 2688 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 2689 if (zone_index == ZONE_MOVABLE) 2690 continue; 2691 2692 if (arch_zone_highest_possible_pfn[zone_index] > 2693 arch_zone_lowest_possible_pfn[zone_index]) 2694 break; 2695 } 2696 2697 VM_BUG_ON(zone_index == -1); 2698 movable_zone = zone_index; 2699 } 2700 2701 /* 2702 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 2703 * because it is sized independent of architecture. Unlike the other zones, 2704 * the starting point for ZONE_MOVABLE is not fixed. It may be different 2705 * in each node depending on the size of each node and how evenly kernelcore 2706 * is distributed. This helper function adjusts the zone ranges 2707 * provided by the architecture for a given node by using the end of the 2708 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 2709 * zones within a node are in order of monotonically increasing memory addresses 2710 */ 2711 void __meminit adjust_zone_range_for_zone_movable(int nid, 2712 unsigned long zone_type, 2713 unsigned long node_start_pfn, 2714 unsigned long node_end_pfn, 2715 unsigned long *zone_start_pfn, 2716 unsigned long *zone_end_pfn) 2717 { 2718 /* Only adjust if ZONE_MOVABLE is on this node */ 2719 if (zone_movable_pfn[nid]) { 2720 /* Size ZONE_MOVABLE */ 2721 if (zone_type == ZONE_MOVABLE) { 2722 *zone_start_pfn = zone_movable_pfn[nid]; 2723 *zone_end_pfn = min(node_end_pfn, 2724 arch_zone_highest_possible_pfn[movable_zone]); 2725 2726 /* Adjust for ZONE_MOVABLE starting within this range */ 2727 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 2728 *zone_end_pfn > zone_movable_pfn[nid]) { 2729 *zone_end_pfn = zone_movable_pfn[nid]; 2730 2731 /* Check if this whole range is within ZONE_MOVABLE */ 2732 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 2733 *zone_start_pfn = *zone_end_pfn; 2734 } 2735 } 2736 2737 /* 2738 * Return the number of pages a zone spans in a node, including holes 2739 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 2740 */ 2741 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 2742 unsigned long zone_type, 2743 unsigned long *ignored) 2744 { 2745 unsigned long node_start_pfn, node_end_pfn; 2746 unsigned long zone_start_pfn, zone_end_pfn; 2747 2748 /* Get the start and end of the node and zone */ 2749 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2750 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 2751 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 2752 adjust_zone_range_for_zone_movable(nid, zone_type, 2753 node_start_pfn, node_end_pfn, 2754 &zone_start_pfn, &zone_end_pfn); 2755 2756 /* Check that this node has pages within the zone's required range */ 2757 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 2758 return 0; 2759 2760 /* Move the zone boundaries inside the node if necessary */ 2761 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 2762 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 2763 2764 /* Return the spanned pages */ 2765 return zone_end_pfn - zone_start_pfn; 2766 } 2767 2768 /* 2769 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 2770 * then all holes in the requested range will be accounted for.
2771 */ 2772 unsigned long __meminit __absent_pages_in_range(int nid, 2773 unsigned long range_start_pfn, 2774 unsigned long range_end_pfn) 2775 { 2776 int i = 0; 2777 unsigned long prev_end_pfn = 0, hole_pages = 0; 2778 unsigned long start_pfn; 2779 2780 /* Find the end_pfn of the first active range of pfns in the node */ 2781 i = first_active_region_index_in_nid(nid); 2782 if (i == -1) 2783 return 0; 2784 2785 /* Account for ranges before physical memory on this node */ 2786 if (early_node_map[i].start_pfn > range_start_pfn) 2787 hole_pages = early_node_map[i].start_pfn - range_start_pfn; 2788 2789 prev_end_pfn = early_node_map[i].start_pfn; 2790 2791 /* Find all holes for the zone within the node */ 2792 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 2793 2794 /* No need to continue if prev_end_pfn is outside the zone */ 2795 if (prev_end_pfn >= range_end_pfn) 2796 break; 2797 2798 /* Make sure the end of the zone is not within the hole */ 2799 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 2800 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 2801 2802 /* Update the hole size count and move on */ 2803 if (start_pfn > range_start_pfn) { 2804 BUG_ON(prev_end_pfn > start_pfn); 2805 hole_pages += start_pfn - prev_end_pfn; 2806 } 2807 prev_end_pfn = early_node_map[i].end_pfn; 2808 } 2809 2810 /* Account for ranges past physical memory on this node */ 2811 if (range_end_pfn > prev_end_pfn) 2812 hole_pages += range_end_pfn - 2813 max(range_start_pfn, prev_end_pfn); 2814 2815 return hole_pages; 2816 } 2817 2818 /** 2819 * absent_pages_in_range - Return number of page frames in holes within a range 2820 * @start_pfn: The start PFN to start searching for holes 2821 * @end_pfn: The end PFN to stop searching for holes 2822 * 2823 * It returns the number of page frames in memory holes within a range.
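 *
 * For example (ranges invented): if the only registered active ranges
 * are [0, 0x60000) and [0x80000, 0x100000), then
 * absent_pages_in_range(0, 0x100000) returns 0x20000, the size in
 * pages of the hole between the two ranges.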
2824 */ 2825 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 2826 unsigned long end_pfn) 2827 { 2828 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 2829 } 2830 2831 /* Return the number of page frames in holes in a zone on a node */ 2832 static unsigned long __meminit zone_absent_pages_in_node(int nid, 2833 unsigned long zone_type, 2834 unsigned long *ignored) 2835 { 2836 unsigned long node_start_pfn, node_end_pfn; 2837 unsigned long zone_start_pfn, zone_end_pfn; 2838 2839 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2840 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 2841 node_start_pfn); 2842 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 2843 node_end_pfn); 2844 2845 adjust_zone_range_for_zone_movable(nid, zone_type, 2846 node_start_pfn, node_end_pfn, 2847 &zone_start_pfn, &zone_end_pfn); 2848 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 2849 } 2850 2851 #else 2852 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 2853 unsigned long zone_type, 2854 unsigned long *zones_size) 2855 { 2856 return zones_size[zone_type]; 2857 } 2858 2859 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 2860 unsigned long zone_type, 2861 unsigned long *zholes_size) 2862 { 2863 if (!zholes_size) 2864 return 0; 2865 2866 return zholes_size[zone_type]; 2867 } 2868 2869 #endif 2870 2871 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 2872 unsigned long *zones_size, unsigned long *zholes_size) 2873 { 2874 unsigned long realtotalpages, totalpages = 0; 2875 enum zone_type i; 2876 2877 for (i = 0; i < MAX_NR_ZONES; i++) 2878 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 2879 zones_size); 2880 pgdat->node_spanned_pages = totalpages; 2881 2882 realtotalpages = totalpages; 2883 for (i = 0; i < MAX_NR_ZONES; i++) 2884 realtotalpages -= 2885 zone_absent_pages_in_node(pgdat->node_id, i, 2886 zholes_size); 2887 pgdat->node_present_pages = realtotalpages; 2888 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 2889 realtotalpages); 2890 } 2891 2892 /* 2893 * Set up the zone data structures: 2894 * - mark all pages reserved 2895 * - mark all memory queues empty 2896 * - clear the memory bitmaps 2897 */ 2898 static void __meminit free_area_init_core(struct pglist_data *pgdat, 2899 unsigned long *zones_size, unsigned long *zholes_size) 2900 { 2901 enum zone_type j; 2902 int nid = pgdat->node_id; 2903 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2904 int ret; 2905 2906 pgdat_resize_init(pgdat); 2907 pgdat->nr_zones = 0; 2908 init_waitqueue_head(&pgdat->kswapd_wait); 2909 pgdat->kswapd_max_order = 0; 2910 2911 for (j = 0; j < MAX_NR_ZONES; j++) { 2912 struct zone *zone = pgdat->node_zones + j; 2913 unsigned long size, realsize, memmap_pages; 2914 2915 size = zone_spanned_pages_in_node(nid, j, zones_size); 2916 realsize = size - zone_absent_pages_in_node(nid, j, 2917 zholes_size); 2918 2919 /* 2920 * Adjust realsize so that it accounts for how much memory 2921 * is used by this zone for memmap. 
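 * (Rough illustration, assuming 4K pages and a 32-byte struct page: a
 * zone spanning 262144 pages -- 1GB -- needs 262144 * 32 >> PAGE_SHIFT
 * = 2048 pages, i.e. 8MB, of memmap, and that is what gets subtracted
 * from realsize here.)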
This affects the watermark 2922 * and per-cpu initialisations 2923 */ 2924 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; 2925 if (realsize >= memmap_pages) { 2926 realsize -= memmap_pages; 2927 printk(KERN_DEBUG 2928 " %s zone: %lu pages used for memmap\n", 2929 zone_names[j], memmap_pages); 2930 } else 2931 printk(KERN_WARNING 2932 " %s zone: %lu pages exceeds realsize %lu\n", 2933 zone_names[j], memmap_pages, realsize); 2934 2935 /* Account for reserved pages */ 2936 if (j == 0 && realsize > dma_reserve) { 2937 realsize -= dma_reserve; 2938 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 2939 zone_names[0], dma_reserve); 2940 } 2941 2942 if (!is_highmem_idx(j)) 2943 nr_kernel_pages += realsize; 2944 nr_all_pages += realsize; 2945 2946 zone->spanned_pages = size; 2947 zone->present_pages = realsize; 2948 #ifdef CONFIG_NUMA 2949 zone->node = nid; 2950 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 2951 / 100; 2952 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 2953 #endif 2954 zone->name = zone_names[j]; 2955 spin_lock_init(&zone->lock); 2956 spin_lock_init(&zone->lru_lock); 2957 zone_seqlock_init(zone); 2958 zone->zone_pgdat = pgdat; 2959 2960 zone->prev_priority = DEF_PRIORITY; 2961 2962 zone_pcp_init(zone); 2963 INIT_LIST_HEAD(&zone->active_list); 2964 INIT_LIST_HEAD(&zone->inactive_list); 2965 zone->nr_scan_active = 0; 2966 zone->nr_scan_inactive = 0; 2967 zap_zone_vm_stats(zone); 2968 atomic_set(&zone->reclaim_in_progress, 0); 2969 if (!size) 2970 continue; 2971 2972 ret = init_currently_empty_zone(zone, zone_start_pfn, 2973 size, MEMMAP_EARLY); 2974 BUG_ON(ret); 2975 zone_start_pfn += size; 2976 } 2977 } 2978 2979 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 2980 { 2981 /* Skip empty nodes */ 2982 if (!pgdat->node_spanned_pages) 2983 return; 2984 2985 #ifdef CONFIG_FLAT_NODE_MEM_MAP 2986 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2987 if (!pgdat->node_mem_map) { 2988 unsigned long size, start, end; 2989 struct page *map; 2990 2991 /* 2992 * The zone's endpoints aren't required to be MAX_ORDER 2993 * aligned but the node_mem_map endpoints must be in order 2994 * for the buddy allocator to function correctly. 
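 *
 * For instance (numbers invented, assuming the common MAX_ORDER of 11
 * so MAX_ORDER_NR_PAGES is 1024): a node starting at pfn 1000 and
 * spanning 5000 pages gets a mem_map covering pfns [0, 6144) -- the
 * start is rounded down and the end rounded up to a 1024-page
 * boundary by the code below.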
2995 */ 2996 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 2997 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 2998 end = ALIGN(end, MAX_ORDER_NR_PAGES); 2999 size = (end - start) * sizeof(struct page); 3000 map = alloc_remap(pgdat->node_id, size); 3001 if (!map) 3002 map = alloc_bootmem_node(pgdat, size); 3003 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 3004 } 3005 #ifndef CONFIG_NEED_MULTIPLE_NODES 3006 /* 3007 * With no DISCONTIG, the global mem_map is just set as node 0's 3008 */ 3009 if (pgdat == NODE_DATA(0)) { 3010 mem_map = NODE_DATA(0)->node_mem_map; 3011 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3012 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 3013 mem_map -= pgdat->node_start_pfn; 3014 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3015 } 3016 #endif 3017 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3018 } 3019 3020 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, 3021 unsigned long *zones_size, unsigned long node_start_pfn, 3022 unsigned long *zholes_size) 3023 { 3024 pgdat->node_id = nid; 3025 pgdat->node_start_pfn = node_start_pfn; 3026 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3027 3028 alloc_node_mem_map(pgdat); 3029 3030 free_area_init_core(pgdat, zones_size, zholes_size); 3031 } 3032 3033 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3034 3035 #if MAX_NUMNODES > 1 3036 /* 3037 * Figure out the number of possible node ids. 3038 */ 3039 static void __init setup_nr_node_ids(void) 3040 { 3041 unsigned int node; 3042 unsigned int highest = 0; 3043 3044 for_each_node_mask(node, node_possible_map) 3045 highest = node; 3046 nr_node_ids = highest + 1; 3047 } 3048 #else 3049 static inline void setup_nr_node_ids(void) 3050 { 3051 } 3052 #endif 3053 3054 /** 3055 * add_active_range - Register a range of PFNs backed by physical memory 3056 * @nid: The node ID the range resides on 3057 * @start_pfn: The start PFN of the available physical memory 3058 * @end_pfn: The end PFN of the available physical memory 3059 * 3060 * These ranges are stored in an early_node_map[] and later used by 3061 * free_area_init_nodes() to calculate zone sizes and holes. If the 3062 * range spans a memory hole, it is up to the architecture to ensure 3063 * the memory is not freed by the bootmem allocator. If possible 3064 * the range being registered will be merged with existing ranges. 
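 *
 * A hypothetical call sequence (values invented for illustration): an
 * arch walking its firmware memory map might issue
 *
 *	add_active_range(0, 0, 0x9f);
 *	add_active_range(0, 0x100, 0x3fff0);
 *
 * registering two usable ranges on node 0 with a hole between them;
 * free_area_init_nodes() later turns the registered ranges into zone
 * sizes and hole counts.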
3065 */ 3066 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 3067 unsigned long end_pfn) 3068 { 3069 int i; 3070 3071 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 3072 "%d entries of %d used\n", 3073 nid, start_pfn, end_pfn, 3074 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3075 3076 /* Merge with existing active regions if possible */ 3077 for (i = 0; i < nr_nodemap_entries; i++) { 3078 if (early_node_map[i].nid != nid) 3079 continue; 3080 3081 /* Skip if an existing region covers this new one */ 3082 if (start_pfn >= early_node_map[i].start_pfn && 3083 end_pfn <= early_node_map[i].end_pfn) 3084 return; 3085 3086 /* Merge forward if suitable */ 3087 if (start_pfn <= early_node_map[i].end_pfn && 3088 end_pfn > early_node_map[i].end_pfn) { 3089 early_node_map[i].end_pfn = end_pfn; 3090 return; 3091 } 3092 3093 /* Merge backward if suitable */ 3094 if (start_pfn < early_node_map[i].end_pfn && 3095 end_pfn >= early_node_map[i].start_pfn) { 3096 early_node_map[i].start_pfn = start_pfn; 3097 return; 3098 } 3099 } 3100 3101 /* Check that early_node_map is large enough */ 3102 if (i >= MAX_ACTIVE_REGIONS) { 3103 printk(KERN_CRIT "More than %d memory regions, truncating\n", 3104 MAX_ACTIVE_REGIONS); 3105 return; 3106 } 3107 3108 early_node_map[i].nid = nid; 3109 early_node_map[i].start_pfn = start_pfn; 3110 early_node_map[i].end_pfn = end_pfn; 3111 nr_nodemap_entries = i + 1; 3112 } 3113 3114 /** 3115 * shrink_active_range - Shrink an existing registered range of PFNs 3116 * @nid: The node id the range is on that should be shrunk 3117 * @old_end_pfn: The old end PFN of the range 3118 * @new_end_pfn: The new PFN of the range 3119 * 3120 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. 3121 * The map is kept at the end physical page range that has already been 3122 * registered with add_active_range(). This function allows an arch to shrink 3123 * an existing registered range. 3124 */ 3125 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 3126 unsigned long new_end_pfn) 3127 { 3128 int i; 3129 3130 /* Find the old active region end and shrink */ 3131 for_each_active_range_index_in_nid(i, nid) 3132 if (early_node_map[i].end_pfn == old_end_pfn) { 3133 early_node_map[i].end_pfn = new_end_pfn; 3134 break; 3135 } 3136 } 3137 3138 /** 3139 * remove_all_active_ranges - Remove all currently registered regions 3140 * 3141 * During discovery, it may be found that a table like SRAT is invalid 3142 * and an alternative discovery method must be used. This function removes 3143 * all currently registered regions. 
3144 */ 3145 void __init remove_all_active_ranges(void) 3146 { 3147 memset(early_node_map, 0, sizeof(early_node_map)); 3148 nr_nodemap_entries = 0; 3149 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 3150 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 3151 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 3152 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 3153 } 3154 3155 /* Compare two active node_active_regions */ 3156 static int __init cmp_node_active_region(const void *a, const void *b) 3157 { 3158 struct node_active_region *arange = (struct node_active_region *)a; 3159 struct node_active_region *brange = (struct node_active_region *)b; 3160 3161 /* Done this way to avoid overflows */ 3162 if (arange->start_pfn > brange->start_pfn) 3163 return 1; 3164 if (arange->start_pfn < brange->start_pfn) 3165 return -1; 3166 3167 return 0; 3168 } 3169 3170 /* sort the node_map by start_pfn */ 3171 static void __init sort_node_map(void) 3172 { 3173 sort(early_node_map, (size_t)nr_nodemap_entries, 3174 sizeof(struct node_active_region), 3175 cmp_node_active_region, NULL); 3176 } 3177 3178 /* Find the lowest pfn for a node */ 3179 unsigned long __init find_min_pfn_for_node(unsigned long nid) 3180 { 3181 int i; 3182 unsigned long min_pfn = ULONG_MAX; 3183 3184 /* Assuming a sorted map, the first range found has the starting pfn */ 3185 for_each_active_range_index_in_nid(i, nid) 3186 min_pfn = min(min_pfn, early_node_map[i].start_pfn); 3187 3188 if (min_pfn == ULONG_MAX) { 3189 printk(KERN_WARNING 3190 "Could not find start_pfn for node %lu\n", nid); 3191 return 0; 3192 } 3193 3194 return min_pfn; 3195 } 3196 3197 /** 3198 * find_min_pfn_with_active_regions - Find the minimum PFN registered 3199 * 3200 * It returns the minimum PFN based on information provided via 3201 * add_active_range(). 3202 */ 3203 unsigned long __init find_min_pfn_with_active_regions(void) 3204 { 3205 return find_min_pfn_for_node(MAX_NUMNODES); 3206 } 3207 3208 /** 3209 * find_max_pfn_with_active_regions - Find the maximum PFN registered 3210 * 3211 * It returns the maximum PFN based on information provided via 3212 * add_active_range(). 3213 */ 3214 unsigned long __init find_max_pfn_with_active_regions(void) 3215 { 3216 int i; 3217 unsigned long max_pfn = 0; 3218 3219 for (i = 0; i < nr_nodemap_entries; i++) 3220 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 3221 3222 return max_pfn; 3223 } 3224 3225 unsigned long __init early_calculate_totalpages(void) 3226 { 3227 int i; 3228 unsigned long totalpages = 0; 3229 3230 for (i = 0; i < nr_nodemap_entries; i++) 3231 totalpages += early_node_map[i].end_pfn - 3232 early_node_map[i].start_pfn; 3233 3234 return totalpages; 3235 } 3236 3237 /* 3238 * Find the PFN the Movable zone begins in each node. Kernel memory 3239 * is spread evenly between nodes as long as the nodes have enough 3240 * memory. When they don't, some nodes will have more kernelcore than 3241 * others 3242 */ 3243 void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3244 { 3245 int i, nid; 3246 unsigned long usable_startpfn; 3247 unsigned long kernelcore_node, kernelcore_remaining; 3248 int usable_nodes = num_online_nodes(); 3249 3250 /* 3251 * If movablecore was specified, calculate what size of 3252 * kernelcore that corresponds so that memory usable for 3253 * any allocation type is evenly spread. 
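 * (Illustration with invented numbers: on a 4GB machine booted with
 * movablecore=1G, corepages works out to roughly 3GB worth of pages
 * and required_kernelcore is raised to at least that value below.)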
If both kernelcore 3254 * and movablecore are specified, then the value of kernelcore 3255 * will be used for required_kernelcore if it's greater than 3256 * what movablecore would have allowed. 3257 */ 3258 if (required_movablecore) { 3259 unsigned long totalpages = early_calculate_totalpages(); 3260 unsigned long corepages; 3261 3262 /* 3263 * Round-up so that ZONE_MOVABLE is at least as large as what 3264 * was requested by the user 3265 */ 3266 required_movablecore = 3267 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 3268 corepages = totalpages - required_movablecore; 3269 3270 required_kernelcore = max(required_kernelcore, corepages); 3271 } 3272 3273 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 3274 if (!required_kernelcore) 3275 return; 3276 3277 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 3278 find_usable_zone_for_movable(); 3279 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 3280 3281 restart: 3282 /* Spread kernelcore memory as evenly as possible throughout nodes */ 3283 kernelcore_node = required_kernelcore / usable_nodes; 3284 for_each_online_node(nid) { 3285 /* 3286 * Recalculate kernelcore_node if the division per node 3287 * now exceeds what is necessary to satisfy the requested 3288 * amount of memory for the kernel 3289 */ 3290 if (required_kernelcore < kernelcore_node) 3291 kernelcore_node = required_kernelcore / usable_nodes; 3292 3293 /* 3294 * As the map is walked, we track how much memory is usable 3295 * by the kernel using kernelcore_remaining. When it is 3296 * 0, the rest of the node is usable by ZONE_MOVABLE 3297 */ 3298 kernelcore_remaining = kernelcore_node; 3299 3300 /* Go through each range of PFNs within this node */ 3301 for_each_active_range_index_in_nid(i, nid) { 3302 unsigned long start_pfn, end_pfn; 3303 unsigned long size_pages; 3304 3305 start_pfn = max(early_node_map[i].start_pfn, 3306 zone_movable_pfn[nid]); 3307 end_pfn = early_node_map[i].end_pfn; 3308 if (start_pfn >= end_pfn) 3309 continue; 3310 3311 /* Account for what is only usable for kernelcore */ 3312 if (start_pfn < usable_startpfn) { 3313 unsigned long kernel_pages; 3314 kernel_pages = min(end_pfn, usable_startpfn) 3315 - start_pfn; 3316 3317 kernelcore_remaining -= min(kernel_pages, 3318 kernelcore_remaining); 3319 required_kernelcore -= min(kernel_pages, 3320 required_kernelcore); 3321 3322 /* Continue if range is now fully accounted */ 3323 if (end_pfn <= usable_startpfn) { 3324 3325 /* 3326 * Push zone_movable_pfn to the end so 3327 * that if we have to rebalance 3328 * kernelcore across nodes, we will 3329 * not double account here 3330 */ 3331 zone_movable_pfn[nid] = end_pfn; 3332 continue; 3333 } 3334 start_pfn = usable_startpfn; 3335 } 3336 3337 /* 3338 * The usable PFN range for ZONE_MOVABLE is from 3339 * start_pfn->end_pfn. Calculate size_pages as the 3340 * number of pages used as kernelcore 3341 */ 3342 size_pages = end_pfn - start_pfn; 3343 if (size_pages > kernelcore_remaining) 3344 size_pages = kernelcore_remaining; 3345 zone_movable_pfn[nid] = start_pfn + size_pages; 3346 3347 /* 3348 * Some kernelcore has been met, update counts and 3349 * break if the kernelcore for this node has been 3350 * satisified 3351 */ 3352 required_kernelcore -= min(required_kernelcore, 3353 size_pages); 3354 kernelcore_remaining -= size_pages; 3355 if (!kernelcore_remaining) 3356 break; 3357 } 3358 } 3359 3360 /* 3361 * If there is still required_kernelcore, we do another pass with one 3362 * less node in the count. 
This will push zone_movable_pfn[nid] further 3363 * along on the nodes that still have memory until kernelcore is 3364 * satisified 3365 */ 3366 usable_nodes--; 3367 if (usable_nodes && required_kernelcore > usable_nodes) 3368 goto restart; 3369 3370 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 3371 for (nid = 0; nid < MAX_NUMNODES; nid++) 3372 zone_movable_pfn[nid] = 3373 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 3374 } 3375 3376 /** 3377 * free_area_init_nodes - Initialise all pg_data_t and zone data 3378 * @max_zone_pfn: an array of max PFNs for each zone 3379 * 3380 * This will call free_area_init_node() for each active node in the system. 3381 * Using the page ranges provided by add_active_range(), the size of each 3382 * zone in each node and their holes is calculated. If the maximum PFN 3383 * between two adjacent zones match, it is assumed that the zone is empty. 3384 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 3385 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 3386 * starts where the previous one ended. For example, ZONE_DMA32 starts 3387 * at arch_max_dma_pfn. 3388 */ 3389 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 3390 { 3391 unsigned long nid; 3392 enum zone_type i; 3393 3394 /* Sort early_node_map as initialisation assumes it is sorted */ 3395 sort_node_map(); 3396 3397 /* Record where the zone boundaries are */ 3398 memset(arch_zone_lowest_possible_pfn, 0, 3399 sizeof(arch_zone_lowest_possible_pfn)); 3400 memset(arch_zone_highest_possible_pfn, 0, 3401 sizeof(arch_zone_highest_possible_pfn)); 3402 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 3403 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 3404 for (i = 1; i < MAX_NR_ZONES; i++) { 3405 if (i == ZONE_MOVABLE) 3406 continue; 3407 arch_zone_lowest_possible_pfn[i] = 3408 arch_zone_highest_possible_pfn[i-1]; 3409 arch_zone_highest_possible_pfn[i] = 3410 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 3411 } 3412 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 3413 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 3414 3415 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 3416 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 3417 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 3418 3419 /* Print out the zone ranges */ 3420 printk("Zone PFN ranges:\n"); 3421 for (i = 0; i < MAX_NR_ZONES; i++) { 3422 if (i == ZONE_MOVABLE) 3423 continue; 3424 printk(" %-8s %8lu -> %8lu\n", 3425 zone_names[i], 3426 arch_zone_lowest_possible_pfn[i], 3427 arch_zone_highest_possible_pfn[i]); 3428 } 3429 3430 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 3431 printk("Movable zone start PFN for each node\n"); 3432 for (i = 0; i < MAX_NUMNODES; i++) { 3433 if (zone_movable_pfn[i]) 3434 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 3435 } 3436 3437 /* Print out the early_node_map[] */ 3438 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 3439 for (i = 0; i < nr_nodemap_entries; i++) 3440 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 3441 early_node_map[i].start_pfn, 3442 early_node_map[i].end_pfn); 3443 3444 /* Initialise every node */ 3445 setup_nr_node_ids(); 3446 for_each_online_node(nid) { 3447 pg_data_t *pgdat = NODE_DATA(nid); 3448 free_area_init_node(nid, pgdat, NULL, 3449 find_min_pfn_for_node(nid), NULL); 3450 } 3451 } 3452 3453 static int __init cmdline_parse_core(char *p, unsigned long *core) 3454 { 3455 unsigned long long coremem; 
3456 if (!p) 3457 return -EINVAL; 3458 3459 coremem = memparse(p, &p); 3460 *core = coremem >> PAGE_SHIFT; 3461 3462 /* Paranoid check that UL is enough for the coremem value */ 3463 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 3464 3465 return 0; 3466 } 3467 3468 /* 3469 * kernelcore=size sets the amount of memory for use for allocations that 3470 * cannot be reclaimed or migrated. 3471 */ 3472 static int __init cmdline_parse_kernelcore(char *p) 3473 { 3474 return cmdline_parse_core(p, &required_kernelcore); 3475 } 3476 3477 /* 3478 * movablecore=size sets the amount of memory for use for allocations that 3479 * can be reclaimed or migrated. 3480 */ 3481 static int __init cmdline_parse_movablecore(char *p) 3482 { 3483 return cmdline_parse_core(p, &required_movablecore); 3484 } 3485 3486 early_param("kernelcore", cmdline_parse_kernelcore); 3487 early_param("movablecore", cmdline_parse_movablecore); 3488 3489 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3490 3491 /** 3492 * set_dma_reserve - set the specified number of pages reserved in the first zone 3493 * @new_dma_reserve: The number of pages to mark reserved 3494 * 3495 * The per-cpu batchsize and zone watermarks are determined by present_pages. 3496 * In the DMA zone, a significant percentage may be consumed by kernel image 3497 * and other unfreeable allocations which can skew the watermarks badly. This 3498 * function may optionally be used to account for unfreeable pages in the 3499 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 3500 * smaller per-cpu batchsize. 3501 */ 3502 void __init set_dma_reserve(unsigned long new_dma_reserve) 3503 { 3504 dma_reserve = new_dma_reserve; 3505 } 3506 3507 #ifndef CONFIG_NEED_MULTIPLE_NODES 3508 static bootmem_data_t contig_bootmem_data; 3509 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 3510 3511 EXPORT_SYMBOL(contig_page_data); 3512 #endif 3513 3514 void __init free_area_init(unsigned long *zones_size) 3515 { 3516 free_area_init_node(0, NODE_DATA(0), zones_size, 3517 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 3518 } 3519 3520 static int page_alloc_cpu_notify(struct notifier_block *self, 3521 unsigned long action, void *hcpu) 3522 { 3523 int cpu = (unsigned long)hcpu; 3524 3525 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3526 local_irq_disable(); 3527 __drain_pages(cpu); 3528 vm_events_fold_cpu(cpu); 3529 local_irq_enable(); 3530 refresh_cpu_vm_stats(cpu); 3531 } 3532 return NOTIFY_OK; 3533 } 3534 3535 void __init page_alloc_init(void) 3536 { 3537 hotcpu_notifier(page_alloc_cpu_notify, 0); 3538 } 3539 3540 /* 3541 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 3542 * or min_free_kbytes changes. 3543 */ 3544 static void calculate_totalreserve_pages(void) 3545 { 3546 struct pglist_data *pgdat; 3547 unsigned long reserve_pages = 0; 3548 enum zone_type i, j; 3549 3550 for_each_online_pgdat(pgdat) { 3551 for (i = 0; i < MAX_NR_ZONES; i++) { 3552 struct zone *zone = pgdat->node_zones + i; 3553 unsigned long max = 0; 3554 3555 /* Find valid and maximum lowmem_reserve in the zone */ 3556 for (j = i; j < MAX_NR_ZONES; j++) { 3557 if (zone->lowmem_reserve[j] > max) 3558 max = zone->lowmem_reserve[j]; 3559 } 3560 3561 /* we treat pages_high as reserved pages. 
*/ 3562 max += zone->pages_high; 3563 3564 if (max > zone->present_pages) 3565 max = zone->present_pages; 3566 reserve_pages += max; 3567 } 3568 } 3569 totalreserve_pages = reserve_pages; 3570 } 3571 3572 /* 3573 * setup_per_zone_lowmem_reserve - called whenever 3574 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 3575 * has a correct pages reserved value, so an adequate number of 3576 * pages are left in the zone after a successful __alloc_pages(). 3577 */ 3578 static void setup_per_zone_lowmem_reserve(void) 3579 { 3580 struct pglist_data *pgdat; 3581 enum zone_type j, idx; 3582 3583 for_each_online_pgdat(pgdat) { 3584 for (j = 0; j < MAX_NR_ZONES; j++) { 3585 struct zone *zone = pgdat->node_zones + j; 3586 unsigned long present_pages = zone->present_pages; 3587 3588 zone->lowmem_reserve[j] = 0; 3589 3590 idx = j; 3591 while (idx) { 3592 struct zone *lower_zone; 3593 3594 idx--; 3595 3596 if (sysctl_lowmem_reserve_ratio[idx] < 1) 3597 sysctl_lowmem_reserve_ratio[idx] = 1; 3598 3599 lower_zone = pgdat->node_zones + idx; 3600 lower_zone->lowmem_reserve[j] = present_pages / 3601 sysctl_lowmem_reserve_ratio[idx]; 3602 present_pages += lower_zone->present_pages; 3603 } 3604 } 3605 } 3606 3607 /* update totalreserve_pages */ 3608 calculate_totalreserve_pages(); 3609 } 3610 3611 /** 3612 * setup_per_zone_pages_min - called when min_free_kbytes changes. 3613 * 3614 * Ensures that the pages_{min,low,high} values for each zone are set correctly 3615 * with respect to min_free_kbytes. 3616 */ 3617 void setup_per_zone_pages_min(void) 3618 { 3619 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 3620 unsigned long lowmem_pages = 0; 3621 struct zone *zone; 3622 unsigned long flags; 3623 3624 /* Calculate total number of !ZONE_HIGHMEM pages */ 3625 for_each_zone(zone) { 3626 if (!is_highmem(zone)) 3627 lowmem_pages += zone->present_pages; 3628 } 3629 3630 for_each_zone(zone) { 3631 u64 tmp; 3632 3633 spin_lock_irqsave(&zone->lru_lock, flags); 3634 tmp = (u64)pages_min * zone->present_pages; 3635 do_div(tmp, lowmem_pages); 3636 if (is_highmem(zone)) { 3637 /* 3638 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 3639 * need highmem pages, so cap pages_min to a small 3640 * value here. 3641 * 3642 * The (pages_high-pages_low) and (pages_low-pages_min) 3643 * deltas controls asynch page reclaim, and so should 3644 * not be capped for highmem. 3645 */ 3646 int min_pages; 3647 3648 min_pages = zone->present_pages / 1024; 3649 if (min_pages < SWAP_CLUSTER_MAX) 3650 min_pages = SWAP_CLUSTER_MAX; 3651 if (min_pages > 128) 3652 min_pages = 128; 3653 zone->pages_min = min_pages; 3654 } else { 3655 /* 3656 * If it's a lowmem zone, reserve a number of pages 3657 * proportionate to the zone's size. 3658 */ 3659 zone->pages_min = tmp; 3660 } 3661 3662 zone->pages_low = zone->pages_min + (tmp >> 2); 3663 zone->pages_high = zone->pages_min + (tmp >> 1); 3664 spin_unlock_irqrestore(&zone->lru_lock, flags); 3665 } 3666 3667 /* update totalreserve_pages */ 3668 calculate_totalreserve_pages(); 3669 } 3670 3671 /* 3672 * Initialise min_free_kbytes. 3673 * 3674 * For small machines we want it small (128k min). For large machines 3675 * we want it large (64MB max). But it is not linear, because network 3676 * bandwidth does not increase linearly with machine size. 
We use 3677 * 3678 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 3679 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 3680 * 3681 * which yields 3682 * 3683 * 16MB: 512k 3684 * 32MB: 724k 3685 * 64MB: 1024k 3686 * 128MB: 1448k 3687 * 256MB: 2048k 3688 * 512MB: 2896k 3689 * 1024MB: 4096k 3690 * 2048MB: 5792k 3691 * 4096MB: 8192k 3692 * 8192MB: 11584k 3693 * 16384MB: 16384k 3694 */ 3695 static int __init init_per_zone_pages_min(void) 3696 { 3697 unsigned long lowmem_kbytes; 3698 3699 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 3700 3701 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 3702 if (min_free_kbytes < 128) 3703 min_free_kbytes = 128; 3704 if (min_free_kbytes > 65536) 3705 min_free_kbytes = 65536; 3706 setup_per_zone_pages_min(); 3707 setup_per_zone_lowmem_reserve(); 3708 return 0; 3709 } 3710 module_init(init_per_zone_pages_min) 3711 3712 /* 3713 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 3714 * that we can call two helper functions whenever min_free_kbytes 3715 * changes. 3716 */ 3717 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 3718 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3719 { 3720 proc_dointvec(table, write, file, buffer, length, ppos); 3721 if (write) 3722 setup_per_zone_pages_min(); 3723 return 0; 3724 } 3725 3726 #ifdef CONFIG_NUMA 3727 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 3728 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3729 { 3730 struct zone *zone; 3731 int rc; 3732 3733 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3734 if (rc) 3735 return rc; 3736 3737 for_each_zone(zone) 3738 zone->min_unmapped_pages = (zone->present_pages * 3739 sysctl_min_unmapped_ratio) / 100; 3740 return 0; 3741 } 3742 3743 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 3744 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3745 { 3746 struct zone *zone; 3747 int rc; 3748 3749 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3750 if (rc) 3751 return rc; 3752 3753 for_each_zone(zone) 3754 zone->min_slab_pages = (zone->present_pages * 3755 sysctl_min_slab_ratio) / 100; 3756 return 0; 3757 } 3758 #endif 3759 3760 /* 3761 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 3762 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 3763 * whenever sysctl_lowmem_reserve_ratio changes. 3764 * 3765 * The reserve ratio obviously has absolutely no relation with the 3766 * pages_min watermarks. The lowmem reserve ratio can only make sense 3767 * if in function of the boot time zone sizes. 3768 */ 3769 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 3770 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3771 { 3772 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3773 setup_per_zone_lowmem_reserve(); 3774 return 0; 3775 } 3776 3777 /* 3778 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 3779 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 3780 * can have before it gets flushed back to buddy allocator. 
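 *
 * Example with made-up numbers: writing 8 to the
 * percpu_pagelist_fraction sysctl on a zone of 262144 pages sets
 * pcp->high to 262144 / 8 = 32768 pages on every online CPU's hot
 * list, with pcp->batch derived from it (and capped at PAGE_SHIFT * 8)
 * by setup_pagelist_highmark() above.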

/*
 * percpu_pagelist_fraction - changes pcp->high for each zone on each CPU.
 * It is the fraction of a zone's total pages that a hot per-cpu pagelist
 * may hold before it is flushed back to the buddy allocator.
 */

int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	struct zone *zone;
	unsigned int cpu;
	int ret;

	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	if (!write || (ret == -EINVAL))
		return ret;
	for_each_zone(zone) {
		for_each_online_cpu(cpu) {
			unsigned long high;

			high = zone->present_pages / percpu_pagelist_fraction;
			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
		}
	}
	return 0;
}

int hashdist = HASHDIST_DEFAULT;

#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
{
	if (!str)
		return 0;
	hashdist = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("hashdist=", set_hashdist);
#endif
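
/*
 * A note summarising the behaviour visible in alloc_large_system_hash()
 * below: when hashdist is non-zero, the large boot-time hash tables are
 * allocated with __vmalloc() rather than __get_free_pages(), so on NUMA
 * machines their pages need not come from one contiguous chunk on a
 * single node.  Booting with "hashdist=0" (parsed by set_hashdist()
 * above) forces the contiguous page path; HASH_EARLY tables always come
 * from bootmem regardless of hashdist.
 */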
3880 */ 3881 if (table) { 3882 unsigned long alloc_end = (unsigned long)table + 3883 (PAGE_SIZE << order); 3884 unsigned long used = (unsigned long)table + 3885 PAGE_ALIGN(size); 3886 split_page(virt_to_page(table), order); 3887 while (used < alloc_end) { 3888 free_page(used); 3889 used += PAGE_SIZE; 3890 } 3891 } 3892 } 3893 } while (!table && size > PAGE_SIZE && --log2qty); 3894 3895 if (!table) 3896 panic("Failed to allocate %s hash table\n", tablename); 3897 3898 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", 3899 tablename, 3900 (1U << log2qty), 3901 ilog2(size) - PAGE_SHIFT, 3902 size); 3903 3904 if (_hash_shift) 3905 *_hash_shift = log2qty; 3906 if (_hash_mask) 3907 *_hash_mask = (1 << log2qty) - 1; 3908 3909 return table; 3910 } 3911 3912 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 3913 struct page *pfn_to_page(unsigned long pfn) 3914 { 3915 return __pfn_to_page(pfn); 3916 } 3917 unsigned long page_to_pfn(struct page *page) 3918 { 3919 return __page_to_pfn(page); 3920 } 3921 EXPORT_SYMBOL(pfn_to_page); 3922 EXPORT_SYMBOL(page_to_pfn); 3923 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 3924 3925 3926