1 /* 2 * linux/mm/page_alloc.c 3 * 4 * Manages the free list, the system allocates free pages here. 5 * Note that kmalloc() lives in slab.c 6 * 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 8 * Swap reorganised 29.12.95, Stephen Tweedie 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 15 */ 16 17 #include <linux/stddef.h> 18 #include <linux/mm.h> 19 #include <linux/swap.h> 20 #include <linux/interrupt.h> 21 #include <linux/pagemap.h> 22 #include <linux/bootmem.h> 23 #include <linux/compiler.h> 24 #include <linux/kernel.h> 25 #include <linux/module.h> 26 #include <linux/suspend.h> 27 #include <linux/pagevec.h> 28 #include <linux/blkdev.h> 29 #include <linux/slab.h> 30 #include <linux/oom.h> 31 #include <linux/notifier.h> 32 #include <linux/topology.h> 33 #include <linux/sysctl.h> 34 #include <linux/cpu.h> 35 #include <linux/cpuset.h> 36 #include <linux/memory_hotplug.h> 37 #include <linux/nodemask.h> 38 #include <linux/vmalloc.h> 39 #include <linux/mempolicy.h> 40 #include <linux/stop_machine.h> 41 #include <linux/sort.h> 42 #include <linux/pfn.h> 43 #include <linux/backing-dev.h> 44 #include <linux/fault-inject.h> 45 #include <linux/page-isolation.h> 46 #include <linux/memcontrol.h> 47 48 #include <asm/tlbflush.h> 49 #include <asm/div64.h> 50 #include "internal.h" 51 52 /* 53 * Array of node states. 54 */ 55 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 56 [N_POSSIBLE] = NODE_MASK_ALL, 57 [N_ONLINE] = { { [0] = 1UL } }, 58 #ifndef CONFIG_NUMA 59 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 60 #ifdef CONFIG_HIGHMEM 61 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 62 #endif 63 [N_CPU] = { { [0] = 1UL } }, 64 #endif /* NUMA */ 65 }; 66 EXPORT_SYMBOL(node_states); 67 68 unsigned long totalram_pages __read_mostly; 69 unsigned long totalreserve_pages __read_mostly; 70 long nr_swap_pages; 71 int percpu_pagelist_fraction; 72 73 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 74 int pageblock_order __read_mostly; 75 #endif 76 77 static void __free_pages_ok(struct page *page, unsigned int order); 78 79 /* 80 * results with 256, 32 in the lowmem_reserve sysctl: 81 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 82 * 1G machine -> (16M dma, 784M normal, 224M high) 83 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 84 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 85 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 86 * 87 * TBD: should special case ZONE_DMA32 machines here - in those we normally 88 * don't need any ZONE_NORMAL reservation 89 */ 90 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 91 #ifdef CONFIG_ZONE_DMA 92 256, 93 #endif 94 #ifdef CONFIG_ZONE_DMA32 95 256, 96 #endif 97 #ifdef CONFIG_HIGHMEM 98 32, 99 #endif 100 32, 101 }; 102 103 EXPORT_SYMBOL(totalram_pages); 104 105 static char * const zone_names[MAX_NR_ZONES] = { 106 #ifdef CONFIG_ZONE_DMA 107 "DMA", 108 #endif 109 #ifdef CONFIG_ZONE_DMA32 110 "DMA32", 111 #endif 112 "Normal", 113 #ifdef CONFIG_HIGHMEM 114 "HighMem", 115 #endif 116 "Movable", 117 }; 118 119 int min_free_kbytes = 1024; 120 121 unsigned long __meminitdata nr_kernel_pages; 122 unsigned long __meminitdata nr_all_pages; 123 static unsigned long __meminitdata dma_reserve; 124 125 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 126 /* 127 * MAX_ACTIVE_REGIONS determines the maximum number of distinct 128 * ranges of memory (RAM) that may be registered with add_active_range(). 129 * Ranges passed to add_active_range() will be merged if possible 130 * so the number of times add_active_range() can be called is 131 * related to the number of nodes and the number of holes 132 */ 133 #ifdef CONFIG_MAX_ACTIVE_REGIONS 134 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ 135 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS 136 #else 137 #if MAX_NUMNODES >= 32 138 /* If there can be many nodes, allow up to 50 holes per node */ 139 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) 140 #else 141 /* By default, allow up to 256 distinct regions */ 142 #define MAX_ACTIVE_REGIONS 256 143 #endif 144 #endif 145 146 static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; 147 static int __meminitdata nr_nodemap_entries; 148 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 149 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 150 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 151 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 152 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 153 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 154 unsigned long __initdata required_kernelcore; 155 static unsigned long __initdata required_movablecore; 156 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 157 158 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 159 int movable_zone; 160 EXPORT_SYMBOL(movable_zone); 161 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 162 163 #if MAX_NUMNODES > 1 164 int nr_node_ids __read_mostly = MAX_NUMNODES; 165 EXPORT_SYMBOL(nr_node_ids); 166 #endif 167 168 int page_group_by_mobility_disabled __read_mostly; 169 170 static void set_pageblock_migratetype(struct page *page, int migratetype) 171 { 172 set_pageblock_flags_group(page, (unsigned long)migratetype, 173 PB_migrate, PB_migrate_end); 174 } 175 176 #ifdef CONFIG_DEBUG_VM 177 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 178 { 179 int ret = 0; 180 unsigned seq; 181 unsigned long pfn = page_to_pfn(page); 182 183 do { 184 seq = zone_span_seqbegin(zone); 185 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 186 ret = 1; 187 else if (pfn < zone->zone_start_pfn) 188 ret = 1; 189 } while (zone_span_seqretry(zone, seq)); 190 191 return ret; 192 } 193 194 static int page_is_consistent(struct zone *zone, struct page *page) 195 { 196 if (!pfn_valid_within(page_to_pfn(page))) 197 return 0; 198 if (zone != page_zone(page)) 199 return 0; 200 201 return 1; 202 } 203 /* 204 * Temporary debugging check for pages not lying within a given zone. 205 */ 206 static int bad_range(struct zone *zone, struct page *page) 207 { 208 if (page_outside_zone_boundaries(zone, page)) 209 return 1; 210 if (!page_is_consistent(zone, page)) 211 return 1; 212 213 return 0; 214 } 215 #else 216 static inline int bad_range(struct zone *zone, struct page *page) 217 { 218 return 0; 219 } 220 #endif 221 222 static void bad_page(struct page *page) 223 { 224 printk(KERN_EMERG "Bad page state in process '%s'\n" 225 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" 226 KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 227 KERN_EMERG "Backtrace:\n", 228 current->comm, page, (int)(2*sizeof(unsigned long)), 229 (unsigned long)page->flags, page->mapping, 230 page_mapcount(page), page_count(page)); 231 dump_stack(); 232 page->flags &= ~(1 << PG_lru | 233 1 << PG_private | 234 1 << PG_locked | 235 1 << PG_active | 236 1 << PG_dirty | 237 1 << PG_reclaim | 238 1 << PG_slab | 239 1 << PG_swapcache | 240 1 << PG_writeback | 241 1 << PG_buddy ); 242 set_page_count(page, 0); 243 reset_page_mapcount(page); 244 page->mapping = NULL; 245 add_taint(TAINT_BAD_PAGE); 246 } 247 248 /* 249 * Higher-order pages are called "compound pages". They are structured thusly: 250 * 251 * The first PAGE_SIZE page is called the "head page". 252 * 253 * The remaining PAGE_SIZE pages are called "tail pages". 254 * 255 * All pages have PG_compound set. All pages have their ->private pointing at 256 * the head page (even the head page has this). 257 * 258 * The first tail page's ->lru.next holds the address of the compound page's 259 * put_page() function. Its ->lru.prev holds the order of allocation. 260 * This usage means that zero-order pages may not be compound. 261 */ 262 263 static void free_compound_page(struct page *page) 264 { 265 __free_pages_ok(page, compound_order(page)); 266 } 267 268 static void prep_compound_page(struct page *page, unsigned long order) 269 { 270 int i; 271 int nr_pages = 1 << order; 272 273 set_compound_page_dtor(page, free_compound_page); 274 set_compound_order(page, order); 275 __SetPageHead(page); 276 for (i = 1; i < nr_pages; i++) { 277 struct page *p = page + i; 278 279 __SetPageTail(p); 280 p->first_page = page; 281 } 282 } 283 284 static void destroy_compound_page(struct page *page, unsigned long order) 285 { 286 int i; 287 int nr_pages = 1 << order; 288 289 if (unlikely(compound_order(page) != order)) 290 bad_page(page); 291 292 if (unlikely(!PageHead(page))) 293 bad_page(page); 294 __ClearPageHead(page); 295 for (i = 1; i < nr_pages; i++) { 296 struct page *p = page + i; 297 298 if (unlikely(!PageTail(p) | 299 (p->first_page != page))) 300 bad_page(page); 301 __ClearPageTail(p); 302 } 303 } 304 305 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 306 { 307 int i; 308 309 /* 310 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 311 * and __GFP_HIGHMEM from hard or soft interrupt context. 312 */ 313 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 314 for (i = 0; i < (1 << order); i++) 315 clear_highpage(page + i); 316 } 317 318 static inline void set_page_order(struct page *page, int order) 319 { 320 set_page_private(page, order); 321 __SetPageBuddy(page); 322 } 323 324 static inline void rmv_page_order(struct page *page) 325 { 326 __ClearPageBuddy(page); 327 set_page_private(page, 0); 328 } 329 330 /* 331 * Locate the struct page for both the matching buddy in our 332 * pair (buddy1) and the combined O(n+1) page they form (page). 333 * 334 * 1) Any buddy B1 will have an order O twin B2 which satisfies 335 * the following equation: 336 * B2 = B1 ^ (1 << O) 337 * For example, if the starting buddy (buddy2) is #8 its order 338 * 1 buddy is #10: 339 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 340 * 341 * 2) Any buddy B will have an order O+1 parent P which 342 * satisfies the following equation: 343 * P = B & ~(1 << O) 344 * 345 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 346 */ 347 static inline struct page * 348 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 349 { 350 unsigned long buddy_idx = page_idx ^ (1 << order); 351 352 return page + (buddy_idx - page_idx); 353 } 354 355 static inline unsigned long 356 __find_combined_index(unsigned long page_idx, unsigned int order) 357 { 358 return (page_idx & ~(1 << order)); 359 } 360 361 /* 362 * This function checks whether a page is free && is the buddy 363 * we can do coalesce a page and its buddy if 364 * (a) the buddy is not in a hole && 365 * (b) the buddy is in the buddy system && 366 * (c) a page and its buddy have the same order && 367 * (d) a page and its buddy are in the same zone. 368 * 369 * For recording whether a page is in the buddy system, we use PG_buddy. 370 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 371 * 372 * For recording page's order, we use page_private(page). 373 */ 374 static inline int page_is_buddy(struct page *page, struct page *buddy, 375 int order) 376 { 377 if (!pfn_valid_within(page_to_pfn(buddy))) 378 return 0; 379 380 if (page_zone_id(page) != page_zone_id(buddy)) 381 return 0; 382 383 if (PageBuddy(buddy) && page_order(buddy) == order) { 384 BUG_ON(page_count(buddy) != 0); 385 return 1; 386 } 387 return 0; 388 } 389 390 /* 391 * Freeing function for a buddy system allocator. 392 * 393 * The concept of a buddy system is to maintain direct-mapped table 394 * (containing bit values) for memory blocks of various "orders". 395 * The bottom level table contains the map for the smallest allocatable 396 * units of memory (here, pages), and each level above it describes 397 * pairs of units from the levels below, hence, "buddies". 398 * At a high level, all that happens here is marking the table entry 399 * at the bottom level available, and propagating the changes upward 400 * as necessary, plus some accounting needed to play nicely with other 401 * parts of the VM system. 402 * At each level, we keep a list of pages, which are heads of continuous 403 * free pages of length of (1 << order) and marked with PG_buddy. Page's 404 * order is recorded in page_private(page) field. 405 * So when we are allocating or freeing one, we can derive the state of the 406 * other. That is, if we allocate a small block, and both were 407 * free, the remainder of the region must be split into blocks. 408 * If a block is freed, and its buddy is also free, then this 409 * triggers coalescing into a block of larger size. 410 * 411 * -- wli 412 */ 413 414 static inline void __free_one_page(struct page *page, 415 struct zone *zone, unsigned int order) 416 { 417 unsigned long page_idx; 418 int order_size = 1 << order; 419 int migratetype = get_pageblock_migratetype(page); 420 421 if (unlikely(PageCompound(page))) 422 destroy_compound_page(page, order); 423 424 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 425 426 VM_BUG_ON(page_idx & (order_size - 1)); 427 VM_BUG_ON(bad_range(zone, page)); 428 429 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); 430 while (order < MAX_ORDER-1) { 431 unsigned long combined_idx; 432 struct page *buddy; 433 434 buddy = __page_find_buddy(page, page_idx, order); 435 if (!page_is_buddy(page, buddy, order)) 436 break; /* Move the buddy up one level. */ 437 438 list_del(&buddy->lru); 439 zone->free_area[order].nr_free--; 440 rmv_page_order(buddy); 441 combined_idx = __find_combined_index(page_idx, order); 442 page = page + (combined_idx - page_idx); 443 page_idx = combined_idx; 444 order++; 445 } 446 set_page_order(page, order); 447 list_add(&page->lru, 448 &zone->free_area[order].free_list[migratetype]); 449 zone->free_area[order].nr_free++; 450 } 451 452 static inline int free_pages_check(struct page *page) 453 { 454 if (unlikely(page_mapcount(page) | 455 (page->mapping != NULL) | 456 (page_count(page) != 0) | 457 (page->flags & ( 458 1 << PG_lru | 459 1 << PG_private | 460 1 << PG_locked | 461 1 << PG_active | 462 1 << PG_slab | 463 1 << PG_swapcache | 464 1 << PG_writeback | 465 1 << PG_reserved | 466 1 << PG_buddy )))) 467 bad_page(page); 468 if (PageDirty(page)) 469 __ClearPageDirty(page); 470 /* 471 * For now, we report if PG_reserved was found set, but do not 472 * clear it, and do not free the page. But we shall soon need 473 * to do more, for when the ZERO_PAGE count wraps negative. 474 */ 475 return PageReserved(page); 476 } 477 478 /* 479 * Frees a list of pages. 480 * Assumes all pages on list are in same zone, and of same order. 481 * count is the number of pages to free. 482 * 483 * If the zone was previously in an "all pages pinned" state then look to 484 * see if this freeing clears that state. 485 * 486 * And clear the zone's pages_scanned counter, to hold off the "all pages are 487 * pinned" detection logic. 488 */ 489 static void free_pages_bulk(struct zone *zone, int count, 490 struct list_head *list, int order) 491 { 492 spin_lock(&zone->lock); 493 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 494 zone->pages_scanned = 0; 495 while (count--) { 496 struct page *page; 497 498 VM_BUG_ON(list_empty(list)); 499 page = list_entry(list->prev, struct page, lru); 500 /* have to delete it as __free_one_page list manipulates */ 501 list_del(&page->lru); 502 __free_one_page(page, zone, order); 503 } 504 spin_unlock(&zone->lock); 505 } 506 507 static void free_one_page(struct zone *zone, struct page *page, int order) 508 { 509 spin_lock(&zone->lock); 510 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 511 zone->pages_scanned = 0; 512 __free_one_page(page, zone, order); 513 spin_unlock(&zone->lock); 514 } 515 516 static void __free_pages_ok(struct page *page, unsigned int order) 517 { 518 unsigned long flags; 519 int i; 520 int reserved = 0; 521 522 for (i = 0 ; i < (1 << order) ; ++i) 523 reserved += free_pages_check(page + i); 524 if (reserved) 525 return; 526 527 if (!PageHighMem(page)) 528 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 529 arch_free_page(page, order); 530 kernel_map_pages(page, 1 << order, 0); 531 532 local_irq_save(flags); 533 __count_vm_events(PGFREE, 1 << order); 534 free_one_page(page_zone(page), page, order); 535 local_irq_restore(flags); 536 } 537 538 /* 539 * permit the bootmem allocator to evade page validation on high-order frees 540 */ 541 void __init __free_pages_bootmem(struct page *page, unsigned int order) 542 { 543 if (order == 0) { 544 __ClearPageReserved(page); 545 set_page_count(page, 0); 546 set_page_refcounted(page); 547 __free_page(page); 548 } else { 549 int loop; 550 551 prefetchw(page); 552 for (loop = 0; loop < BITS_PER_LONG; loop++) { 553 struct page *p = &page[loop]; 554 555 if (loop + 1 < BITS_PER_LONG) 556 prefetchw(p + 1); 557 __ClearPageReserved(p); 558 set_page_count(p, 0); 559 } 560 561 set_page_refcounted(page); 562 __free_pages(page, order); 563 } 564 } 565 566 567 /* 568 * The order of subdivision here is critical for the IO subsystem. 569 * Please do not alter this order without good reasons and regression 570 * testing. Specifically, as large blocks of memory are subdivided, 571 * the order in which smaller blocks are delivered depends on the order 572 * they're subdivided in this function. This is the primary factor 573 * influencing the order in which pages are delivered to the IO 574 * subsystem according to empirical testing, and this is also justified 575 * by considering the behavior of a buddy system containing a single 576 * large block of memory acted on by a series of small allocations. 577 * This behavior is a critical factor in sglist merging's success. 578 * 579 * -- wli 580 */ 581 static inline void expand(struct zone *zone, struct page *page, 582 int low, int high, struct free_area *area, 583 int migratetype) 584 { 585 unsigned long size = 1 << high; 586 587 while (high > low) { 588 area--; 589 high--; 590 size >>= 1; 591 VM_BUG_ON(bad_range(zone, &page[size])); 592 list_add(&page[size].lru, &area->free_list[migratetype]); 593 area->nr_free++; 594 set_page_order(&page[size], high); 595 } 596 } 597 598 /* 599 * This page is about to be returned from the page allocator 600 */ 601 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 602 { 603 if (unlikely(page_mapcount(page) | 604 (page->mapping != NULL) | 605 (page_count(page) != 0) | 606 (page->flags & ( 607 1 << PG_lru | 608 1 << PG_private | 609 1 << PG_locked | 610 1 << PG_active | 611 1 << PG_dirty | 612 1 << PG_slab | 613 1 << PG_swapcache | 614 1 << PG_writeback | 615 1 << PG_reserved | 616 1 << PG_buddy )))) 617 bad_page(page); 618 619 /* 620 * For now, we report if PG_reserved was found set, but do not 621 * clear it, and do not allocate the page: as a safety net. 622 */ 623 if (PageReserved(page)) 624 return 1; 625 626 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | 627 1 << PG_referenced | 1 << PG_arch_1 | 628 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 629 set_page_private(page, 0); 630 set_page_refcounted(page); 631 632 arch_alloc_page(page, order); 633 kernel_map_pages(page, 1 << order, 1); 634 635 if (gfp_flags & __GFP_ZERO) 636 prep_zero_page(page, order, gfp_flags); 637 638 if (order && (gfp_flags & __GFP_COMP)) 639 prep_compound_page(page, order); 640 641 return 0; 642 } 643 644 /* 645 * Go through the free lists for the given migratetype and remove 646 * the smallest available page from the freelists 647 */ 648 static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 649 int migratetype) 650 { 651 unsigned int current_order; 652 struct free_area * area; 653 struct page *page; 654 655 /* Find a page of the appropriate size in the preferred list */ 656 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 657 area = &(zone->free_area[current_order]); 658 if (list_empty(&area->free_list[migratetype])) 659 continue; 660 661 page = list_entry(area->free_list[migratetype].next, 662 struct page, lru); 663 list_del(&page->lru); 664 rmv_page_order(page); 665 area->nr_free--; 666 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); 667 expand(zone, page, order, current_order, area, migratetype); 668 return page; 669 } 670 671 return NULL; 672 } 673 674 675 /* 676 * This array describes the order lists are fallen back to when 677 * the free lists for the desirable migrate type are depleted 678 */ 679 static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { 680 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 681 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 682 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 683 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ 684 }; 685 686 /* 687 * Move the free pages in a range to the free lists of the requested type. 688 * Note that start_page and end_pages are not aligned on a pageblock 689 * boundary. If alignment is required, use move_freepages_block() 690 */ 691 int move_freepages(struct zone *zone, 692 struct page *start_page, struct page *end_page, 693 int migratetype) 694 { 695 struct page *page; 696 unsigned long order; 697 int pages_moved = 0; 698 699 #ifndef CONFIG_HOLES_IN_ZONE 700 /* 701 * page_zone is not safe to call in this context when 702 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 703 * anyway as we check zone boundaries in move_freepages_block(). 704 * Remove at a later date when no bug reports exist related to 705 * grouping pages by mobility 706 */ 707 BUG_ON(page_zone(start_page) != page_zone(end_page)); 708 #endif 709 710 for (page = start_page; page <= end_page;) { 711 if (!pfn_valid_within(page_to_pfn(page))) { 712 page++; 713 continue; 714 } 715 716 if (!PageBuddy(page)) { 717 page++; 718 continue; 719 } 720 721 order = page_order(page); 722 list_del(&page->lru); 723 list_add(&page->lru, 724 &zone->free_area[order].free_list[migratetype]); 725 page += 1 << order; 726 pages_moved += 1 << order; 727 } 728 729 return pages_moved; 730 } 731 732 int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 733 { 734 unsigned long start_pfn, end_pfn; 735 struct page *start_page, *end_page; 736 737 start_pfn = page_to_pfn(page); 738 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 739 start_page = pfn_to_page(start_pfn); 740 end_page = start_page + pageblock_nr_pages - 1; 741 end_pfn = start_pfn + pageblock_nr_pages - 1; 742 743 /* Do not cross zone boundaries */ 744 if (start_pfn < zone->zone_start_pfn) 745 start_page = page; 746 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 747 return 0; 748 749 return move_freepages(zone, start_page, end_page, migratetype); 750 } 751 752 /* Remove an element from the buddy allocator from the fallback list */ 753 static struct page *__rmqueue_fallback(struct zone *zone, int order, 754 int start_migratetype) 755 { 756 struct free_area * area; 757 int current_order; 758 struct page *page; 759 int migratetype, i; 760 761 /* Find the largest possible block of pages in the other list */ 762 for (current_order = MAX_ORDER-1; current_order >= order; 763 --current_order) { 764 for (i = 0; i < MIGRATE_TYPES - 1; i++) { 765 migratetype = fallbacks[start_migratetype][i]; 766 767 /* MIGRATE_RESERVE handled later if necessary */ 768 if (migratetype == MIGRATE_RESERVE) 769 continue; 770 771 area = &(zone->free_area[current_order]); 772 if (list_empty(&area->free_list[migratetype])) 773 continue; 774 775 page = list_entry(area->free_list[migratetype].next, 776 struct page, lru); 777 area->nr_free--; 778 779 /* 780 * If breaking a large block of pages, move all free 781 * pages to the preferred allocation list. If falling 782 * back for a reclaimable kernel allocation, be more 783 * agressive about taking ownership of free pages 784 */ 785 if (unlikely(current_order >= (pageblock_order >> 1)) || 786 start_migratetype == MIGRATE_RECLAIMABLE) { 787 unsigned long pages; 788 pages = move_freepages_block(zone, page, 789 start_migratetype); 790 791 /* Claim the whole block if over half of it is free */ 792 if (pages >= (1 << (pageblock_order-1))) 793 set_pageblock_migratetype(page, 794 start_migratetype); 795 796 migratetype = start_migratetype; 797 } 798 799 /* Remove the page from the freelists */ 800 list_del(&page->lru); 801 rmv_page_order(page); 802 __mod_zone_page_state(zone, NR_FREE_PAGES, 803 -(1UL << order)); 804 805 if (current_order == pageblock_order) 806 set_pageblock_migratetype(page, 807 start_migratetype); 808 809 expand(zone, page, order, current_order, area, migratetype); 810 return page; 811 } 812 } 813 814 /* Use MIGRATE_RESERVE rather than fail an allocation */ 815 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); 816 } 817 818 /* 819 * Do the hard work of removing an element from the buddy allocator. 820 * Call me with the zone->lock already held. 821 */ 822 static struct page *__rmqueue(struct zone *zone, unsigned int order, 823 int migratetype) 824 { 825 struct page *page; 826 827 page = __rmqueue_smallest(zone, order, migratetype); 828 829 if (unlikely(!page)) 830 page = __rmqueue_fallback(zone, order, migratetype); 831 832 return page; 833 } 834 835 /* 836 * Obtain a specified number of elements from the buddy allocator, all under 837 * a single hold of the lock, for efficiency. Add them to the supplied list. 838 * Returns the number of new pages which were placed at *list. 839 */ 840 static int rmqueue_bulk(struct zone *zone, unsigned int order, 841 unsigned long count, struct list_head *list, 842 int migratetype) 843 { 844 int i; 845 846 spin_lock(&zone->lock); 847 for (i = 0; i < count; ++i) { 848 struct page *page = __rmqueue(zone, order, migratetype); 849 if (unlikely(page == NULL)) 850 break; 851 852 /* 853 * Split buddy pages returned by expand() are received here 854 * in physical page order. The page is added to the callers and 855 * list and the list head then moves forward. From the callers 856 * perspective, the linked list is ordered by page number in 857 * some conditions. This is useful for IO devices that can 858 * merge IO requests if the physical pages are ordered 859 * properly. 860 */ 861 list_add(&page->lru, list); 862 set_page_private(page, migratetype); 863 list = &page->lru; 864 } 865 spin_unlock(&zone->lock); 866 return i; 867 } 868 869 #ifdef CONFIG_NUMA 870 /* 871 * Called from the vmstat counter updater to drain pagesets of this 872 * currently executing processor on remote nodes after they have 873 * expired. 874 * 875 * Note that this function must be called with the thread pinned to 876 * a single processor. 877 */ 878 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 879 { 880 unsigned long flags; 881 int to_drain; 882 883 local_irq_save(flags); 884 if (pcp->count >= pcp->batch) 885 to_drain = pcp->batch; 886 else 887 to_drain = pcp->count; 888 free_pages_bulk(zone, to_drain, &pcp->list, 0); 889 pcp->count -= to_drain; 890 local_irq_restore(flags); 891 } 892 #endif 893 894 /* 895 * Drain pages of the indicated processor. 896 * 897 * The processor must either be the current processor and the 898 * thread pinned to the current processor or a processor that 899 * is not online. 900 */ 901 static void drain_pages(unsigned int cpu) 902 { 903 unsigned long flags; 904 struct zone *zone; 905 906 for_each_zone(zone) { 907 struct per_cpu_pageset *pset; 908 struct per_cpu_pages *pcp; 909 910 if (!populated_zone(zone)) 911 continue; 912 913 pset = zone_pcp(zone, cpu); 914 915 pcp = &pset->pcp; 916 local_irq_save(flags); 917 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 918 pcp->count = 0; 919 local_irq_restore(flags); 920 } 921 } 922 923 /* 924 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 925 */ 926 void drain_local_pages(void *arg) 927 { 928 drain_pages(smp_processor_id()); 929 } 930 931 /* 932 * Spill all the per-cpu pages from all CPUs back into the buddy allocator 933 */ 934 void drain_all_pages(void) 935 { 936 on_each_cpu(drain_local_pages, NULL, 0, 1); 937 } 938 939 #ifdef CONFIG_HIBERNATION 940 941 void mark_free_pages(struct zone *zone) 942 { 943 unsigned long pfn, max_zone_pfn; 944 unsigned long flags; 945 int order, t; 946 struct list_head *curr; 947 948 if (!zone->spanned_pages) 949 return; 950 951 spin_lock_irqsave(&zone->lock, flags); 952 953 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 954 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 955 if (pfn_valid(pfn)) { 956 struct page *page = pfn_to_page(pfn); 957 958 if (!swsusp_page_is_forbidden(page)) 959 swsusp_unset_page_free(page); 960 } 961 962 for_each_migratetype_order(order, t) { 963 list_for_each(curr, &zone->free_area[order].free_list[t]) { 964 unsigned long i; 965 966 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 967 for (i = 0; i < (1UL << order); i++) 968 swsusp_set_page_free(pfn_to_page(pfn + i)); 969 } 970 } 971 spin_unlock_irqrestore(&zone->lock, flags); 972 } 973 #endif /* CONFIG_PM */ 974 975 /* 976 * Free a 0-order page 977 */ 978 static void free_hot_cold_page(struct page *page, int cold) 979 { 980 struct zone *zone = page_zone(page); 981 struct per_cpu_pages *pcp; 982 unsigned long flags; 983 984 if (PageAnon(page)) 985 page->mapping = NULL; 986 if (free_pages_check(page)) 987 return; 988 989 if (!PageHighMem(page)) 990 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 991 VM_BUG_ON(page_get_page_cgroup(page)); 992 arch_free_page(page, 0); 993 kernel_map_pages(page, 1, 0); 994 995 pcp = &zone_pcp(zone, get_cpu())->pcp; 996 local_irq_save(flags); 997 __count_vm_event(PGFREE); 998 if (cold) 999 list_add_tail(&page->lru, &pcp->list); 1000 else 1001 list_add(&page->lru, &pcp->list); 1002 set_page_private(page, get_pageblock_migratetype(page)); 1003 pcp->count++; 1004 if (pcp->count >= pcp->high) { 1005 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1006 pcp->count -= pcp->batch; 1007 } 1008 local_irq_restore(flags); 1009 put_cpu(); 1010 } 1011 1012 void free_hot_page(struct page *page) 1013 { 1014 free_hot_cold_page(page, 0); 1015 } 1016 1017 void free_cold_page(struct page *page) 1018 { 1019 free_hot_cold_page(page, 1); 1020 } 1021 1022 /* 1023 * split_page takes a non-compound higher-order page, and splits it into 1024 * n (1<<order) sub-pages: page[0..n] 1025 * Each sub-page must be freed individually. 1026 * 1027 * Note: this is probably too low level an operation for use in drivers. 1028 * Please consult with lkml before using this in your driver. 1029 */ 1030 void split_page(struct page *page, unsigned int order) 1031 { 1032 int i; 1033 1034 VM_BUG_ON(PageCompound(page)); 1035 VM_BUG_ON(!page_count(page)); 1036 for (i = 1; i < (1 << order); i++) 1037 set_page_refcounted(page + i); 1038 } 1039 1040 /* 1041 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1042 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1043 * or two. 1044 */ 1045 static struct page *buffered_rmqueue(struct zonelist *zonelist, 1046 struct zone *zone, int order, gfp_t gfp_flags) 1047 { 1048 unsigned long flags; 1049 struct page *page; 1050 int cold = !!(gfp_flags & __GFP_COLD); 1051 int cpu; 1052 int migratetype = allocflags_to_migratetype(gfp_flags); 1053 1054 again: 1055 cpu = get_cpu(); 1056 if (likely(order == 0)) { 1057 struct per_cpu_pages *pcp; 1058 1059 pcp = &zone_pcp(zone, cpu)->pcp; 1060 local_irq_save(flags); 1061 if (!pcp->count) { 1062 pcp->count = rmqueue_bulk(zone, 0, 1063 pcp->batch, &pcp->list, migratetype); 1064 if (unlikely(!pcp->count)) 1065 goto failed; 1066 } 1067 1068 /* Find a page of the appropriate migrate type */ 1069 if (cold) { 1070 list_for_each_entry_reverse(page, &pcp->list, lru) 1071 if (page_private(page) == migratetype) 1072 break; 1073 } else { 1074 list_for_each_entry(page, &pcp->list, lru) 1075 if (page_private(page) == migratetype) 1076 break; 1077 } 1078 1079 /* Allocate more to the pcp list if necessary */ 1080 if (unlikely(&page->lru == &pcp->list)) { 1081 pcp->count += rmqueue_bulk(zone, 0, 1082 pcp->batch, &pcp->list, migratetype); 1083 page = list_entry(pcp->list.next, struct page, lru); 1084 } 1085 1086 list_del(&page->lru); 1087 pcp->count--; 1088 } else { 1089 spin_lock_irqsave(&zone->lock, flags); 1090 page = __rmqueue(zone, order, migratetype); 1091 spin_unlock(&zone->lock); 1092 if (!page) 1093 goto failed; 1094 } 1095 1096 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1097 zone_statistics(zonelist, zone); 1098 local_irq_restore(flags); 1099 put_cpu(); 1100 1101 VM_BUG_ON(bad_range(zone, page)); 1102 if (prep_new_page(page, order, gfp_flags)) 1103 goto again; 1104 return page; 1105 1106 failed: 1107 local_irq_restore(flags); 1108 put_cpu(); 1109 return NULL; 1110 } 1111 1112 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1113 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1114 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1115 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1116 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 1117 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1118 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1119 1120 #ifdef CONFIG_FAIL_PAGE_ALLOC 1121 1122 static struct fail_page_alloc_attr { 1123 struct fault_attr attr; 1124 1125 u32 ignore_gfp_highmem; 1126 u32 ignore_gfp_wait; 1127 u32 min_order; 1128 1129 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1130 1131 struct dentry *ignore_gfp_highmem_file; 1132 struct dentry *ignore_gfp_wait_file; 1133 struct dentry *min_order_file; 1134 1135 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1136 1137 } fail_page_alloc = { 1138 .attr = FAULT_ATTR_INITIALIZER, 1139 .ignore_gfp_wait = 1, 1140 .ignore_gfp_highmem = 1, 1141 .min_order = 1, 1142 }; 1143 1144 static int __init setup_fail_page_alloc(char *str) 1145 { 1146 return setup_fault_attr(&fail_page_alloc.attr, str); 1147 } 1148 __setup("fail_page_alloc=", setup_fail_page_alloc); 1149 1150 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1151 { 1152 if (order < fail_page_alloc.min_order) 1153 return 0; 1154 if (gfp_mask & __GFP_NOFAIL) 1155 return 0; 1156 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1157 return 0; 1158 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1159 return 0; 1160 1161 return should_fail(&fail_page_alloc.attr, 1 << order); 1162 } 1163 1164 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1165 1166 static int __init fail_page_alloc_debugfs(void) 1167 { 1168 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1169 struct dentry *dir; 1170 int err; 1171 1172 err = init_fault_attr_dentries(&fail_page_alloc.attr, 1173 "fail_page_alloc"); 1174 if (err) 1175 return err; 1176 dir = fail_page_alloc.attr.dentries.dir; 1177 1178 fail_page_alloc.ignore_gfp_wait_file = 1179 debugfs_create_bool("ignore-gfp-wait", mode, dir, 1180 &fail_page_alloc.ignore_gfp_wait); 1181 1182 fail_page_alloc.ignore_gfp_highmem_file = 1183 debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1184 &fail_page_alloc.ignore_gfp_highmem); 1185 fail_page_alloc.min_order_file = 1186 debugfs_create_u32("min-order", mode, dir, 1187 &fail_page_alloc.min_order); 1188 1189 if (!fail_page_alloc.ignore_gfp_wait_file || 1190 !fail_page_alloc.ignore_gfp_highmem_file || 1191 !fail_page_alloc.min_order_file) { 1192 err = -ENOMEM; 1193 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); 1194 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); 1195 debugfs_remove(fail_page_alloc.min_order_file); 1196 cleanup_fault_attr_dentries(&fail_page_alloc.attr); 1197 } 1198 1199 return err; 1200 } 1201 1202 late_initcall(fail_page_alloc_debugfs); 1203 1204 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1205 1206 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1207 1208 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1209 { 1210 return 0; 1211 } 1212 1213 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1214 1215 /* 1216 * Return 1 if free pages are above 'mark'. This takes into account the order 1217 * of the allocation. 1218 */ 1219 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1220 int classzone_idx, int alloc_flags) 1221 { 1222 /* free_pages my go negative - that's OK */ 1223 long min = mark; 1224 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1225 int o; 1226 1227 if (alloc_flags & ALLOC_HIGH) 1228 min -= min / 2; 1229 if (alloc_flags & ALLOC_HARDER) 1230 min -= min / 4; 1231 1232 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1233 return 0; 1234 for (o = 0; o < order; o++) { 1235 /* At the next order, this order's pages become unavailable */ 1236 free_pages -= z->free_area[o].nr_free << o; 1237 1238 /* Require fewer higher order pages to be free */ 1239 min >>= 1; 1240 1241 if (free_pages <= min) 1242 return 0; 1243 } 1244 return 1; 1245 } 1246 1247 #ifdef CONFIG_NUMA 1248 /* 1249 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1250 * skip over zones that are not allowed by the cpuset, or that have 1251 * been recently (in last second) found to be nearly full. See further 1252 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1253 * that have to skip over a lot of full or unallowed zones. 1254 * 1255 * If the zonelist cache is present in the passed in zonelist, then 1256 * returns a pointer to the allowed node mask (either the current 1257 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) 1258 * 1259 * If the zonelist cache is not available for this zonelist, does 1260 * nothing and returns NULL. 1261 * 1262 * If the fullzones BITMAP in the zonelist cache is stale (more than 1263 * a second since last zap'd) then we zap it out (clear its bits.) 1264 * 1265 * We hold off even calling zlc_setup, until after we've checked the 1266 * first zone in the zonelist, on the theory that most allocations will 1267 * be satisfied from that first zone, so best to examine that zone as 1268 * quickly as we can. 1269 */ 1270 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1271 { 1272 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1273 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1274 1275 zlc = zonelist->zlcache_ptr; 1276 if (!zlc) 1277 return NULL; 1278 1279 if (jiffies - zlc->last_full_zap > 1 * HZ) { 1280 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1281 zlc->last_full_zap = jiffies; 1282 } 1283 1284 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1285 &cpuset_current_mems_allowed : 1286 &node_states[N_HIGH_MEMORY]; 1287 return allowednodes; 1288 } 1289 1290 /* 1291 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1292 * if it is worth looking at further for free memory: 1293 * 1) Check that the zone isn't thought to be full (doesn't have its 1294 * bit set in the zonelist_cache fullzones BITMAP). 1295 * 2) Check that the zones node (obtained from the zonelist_cache 1296 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1297 * Return true (non-zero) if zone is worth looking at further, or 1298 * else return false (zero) if it is not. 1299 * 1300 * This check -ignores- the distinction between various watermarks, 1301 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1302 * found to be full for any variation of these watermarks, it will 1303 * be considered full for up to one second by all requests, unless 1304 * we are so low on memory on all allowed nodes that we are forced 1305 * into the second scan of the zonelist. 1306 * 1307 * In the second scan we ignore this zonelist cache and exactly 1308 * apply the watermarks to all zones, even it is slower to do so. 1309 * We are low on memory in the second scan, and should leave no stone 1310 * unturned looking for a free page. 1311 */ 1312 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1313 nodemask_t *allowednodes) 1314 { 1315 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1316 int i; /* index of *z in zonelist zones */ 1317 int n; /* node that zone *z is on */ 1318 1319 zlc = zonelist->zlcache_ptr; 1320 if (!zlc) 1321 return 1; 1322 1323 i = z - zonelist->zones; 1324 n = zlc->z_to_n[i]; 1325 1326 /* This zone is worth trying if it is allowed but not full */ 1327 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1328 } 1329 1330 /* 1331 * Given 'z' scanning a zonelist, set the corresponding bit in 1332 * zlc->fullzones, so that subsequent attempts to allocate a page 1333 * from that zone don't waste time re-examining it. 1334 */ 1335 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1336 { 1337 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1338 int i; /* index of *z in zonelist zones */ 1339 1340 zlc = zonelist->zlcache_ptr; 1341 if (!zlc) 1342 return; 1343 1344 i = z - zonelist->zones; 1345 1346 set_bit(i, zlc->fullzones); 1347 } 1348 1349 #else /* CONFIG_NUMA */ 1350 1351 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1352 { 1353 return NULL; 1354 } 1355 1356 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1357 nodemask_t *allowednodes) 1358 { 1359 return 1; 1360 } 1361 1362 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1363 { 1364 } 1365 #endif /* CONFIG_NUMA */ 1366 1367 /* 1368 * get_page_from_freelist goes through the zonelist trying to allocate 1369 * a page. 1370 */ 1371 static struct page * 1372 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1373 struct zonelist *zonelist, int alloc_flags) 1374 { 1375 struct zone **z; 1376 struct page *page = NULL; 1377 int classzone_idx = zone_idx(zonelist->zones[0]); 1378 struct zone *zone; 1379 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1380 int zlc_active = 0; /* set if using zonelist_cache */ 1381 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1382 enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ 1383 1384 zonelist_scan: 1385 /* 1386 * Scan zonelist, looking for a zone with enough free. 1387 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1388 */ 1389 z = zonelist->zones; 1390 1391 do { 1392 /* 1393 * In NUMA, this could be a policy zonelist which contains 1394 * zones that may not be allowed by the current gfp_mask. 1395 * Check the zone is allowed by the current flags 1396 */ 1397 if (unlikely(alloc_should_filter_zonelist(zonelist))) { 1398 if (highest_zoneidx == -1) 1399 highest_zoneidx = gfp_zone(gfp_mask); 1400 if (zone_idx(*z) > highest_zoneidx) 1401 continue; 1402 } 1403 1404 if (NUMA_BUILD && zlc_active && 1405 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1406 continue; 1407 zone = *z; 1408 if ((alloc_flags & ALLOC_CPUSET) && 1409 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1410 goto try_next_zone; 1411 1412 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1413 unsigned long mark; 1414 if (alloc_flags & ALLOC_WMARK_MIN) 1415 mark = zone->pages_min; 1416 else if (alloc_flags & ALLOC_WMARK_LOW) 1417 mark = zone->pages_low; 1418 else 1419 mark = zone->pages_high; 1420 if (!zone_watermark_ok(zone, order, mark, 1421 classzone_idx, alloc_flags)) { 1422 if (!zone_reclaim_mode || 1423 !zone_reclaim(zone, gfp_mask, order)) 1424 goto this_zone_full; 1425 } 1426 } 1427 1428 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1429 if (page) 1430 break; 1431 this_zone_full: 1432 if (NUMA_BUILD) 1433 zlc_mark_zone_full(zonelist, z); 1434 try_next_zone: 1435 if (NUMA_BUILD && !did_zlc_setup) { 1436 /* we do zlc_setup after the first zone is tried */ 1437 allowednodes = zlc_setup(zonelist, alloc_flags); 1438 zlc_active = 1; 1439 did_zlc_setup = 1; 1440 } 1441 } while (*(++z) != NULL); 1442 1443 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1444 /* Disable zlc cache for second zonelist scan */ 1445 zlc_active = 0; 1446 goto zonelist_scan; 1447 } 1448 return page; 1449 } 1450 1451 /* 1452 * This is the 'heart' of the zoned buddy allocator. 1453 */ 1454 struct page * 1455 __alloc_pages(gfp_t gfp_mask, unsigned int order, 1456 struct zonelist *zonelist) 1457 { 1458 const gfp_t wait = gfp_mask & __GFP_WAIT; 1459 struct zone **z; 1460 struct page *page; 1461 struct reclaim_state reclaim_state; 1462 struct task_struct *p = current; 1463 int do_retry; 1464 int alloc_flags; 1465 int did_some_progress; 1466 1467 might_sleep_if(wait); 1468 1469 if (should_fail_alloc_page(gfp_mask, order)) 1470 return NULL; 1471 1472 restart: 1473 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1474 1475 if (unlikely(*z == NULL)) { 1476 /* 1477 * Happens if we have an empty zonelist as a result of 1478 * GFP_THISNODE being used on a memoryless node 1479 */ 1480 return NULL; 1481 } 1482 1483 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1484 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1485 if (page) 1486 goto got_pg; 1487 1488 /* 1489 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1490 * __GFP_NOWARN set) should not cause reclaim since the subsystem 1491 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 1492 * using a larger set of nodes after it has established that the 1493 * allowed per node queues are empty and that nodes are 1494 * over allocated. 1495 */ 1496 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1497 goto nopage; 1498 1499 for (z = zonelist->zones; *z; z++) 1500 wakeup_kswapd(*z, order); 1501 1502 /* 1503 * OK, we're below the kswapd watermark and have kicked background 1504 * reclaim. Now things get more complex, so set up alloc_flags according 1505 * to how we want to proceed. 1506 * 1507 * The caller may dip into page reserves a bit more if the caller 1508 * cannot run direct reclaim, or if the caller has realtime scheduling 1509 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1510 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1511 */ 1512 alloc_flags = ALLOC_WMARK_MIN; 1513 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 1514 alloc_flags |= ALLOC_HARDER; 1515 if (gfp_mask & __GFP_HIGH) 1516 alloc_flags |= ALLOC_HIGH; 1517 if (wait) 1518 alloc_flags |= ALLOC_CPUSET; 1519 1520 /* 1521 * Go through the zonelist again. Let __GFP_HIGH and allocations 1522 * coming from realtime tasks go deeper into reserves. 1523 * 1524 * This is the last chance, in general, before the goto nopage. 1525 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1526 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1527 */ 1528 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 1529 if (page) 1530 goto got_pg; 1531 1532 /* This allocation should allow future memory freeing. */ 1533 1534 rebalance: 1535 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1536 && !in_interrupt()) { 1537 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1538 nofail_alloc: 1539 /* go through the zonelist yet again, ignoring mins */ 1540 page = get_page_from_freelist(gfp_mask, order, 1541 zonelist, ALLOC_NO_WATERMARKS); 1542 if (page) 1543 goto got_pg; 1544 if (gfp_mask & __GFP_NOFAIL) { 1545 congestion_wait(WRITE, HZ/50); 1546 goto nofail_alloc; 1547 } 1548 } 1549 goto nopage; 1550 } 1551 1552 /* Atomic allocations - we can't balance anything */ 1553 if (!wait) 1554 goto nopage; 1555 1556 cond_resched(); 1557 1558 /* We now go into synchronous reclaim */ 1559 cpuset_memory_pressure_bump(); 1560 p->flags |= PF_MEMALLOC; 1561 reclaim_state.reclaimed_slab = 0; 1562 p->reclaim_state = &reclaim_state; 1563 1564 did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); 1565 1566 p->reclaim_state = NULL; 1567 p->flags &= ~PF_MEMALLOC; 1568 1569 cond_resched(); 1570 1571 if (order != 0) 1572 drain_all_pages(); 1573 1574 if (likely(did_some_progress)) { 1575 page = get_page_from_freelist(gfp_mask, order, 1576 zonelist, alloc_flags); 1577 if (page) 1578 goto got_pg; 1579 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1580 if (!try_set_zone_oom(zonelist)) { 1581 schedule_timeout_uninterruptible(1); 1582 goto restart; 1583 } 1584 1585 /* 1586 * Go through the zonelist yet one more time, keep 1587 * very high watermark here, this is only to catch 1588 * a parallel oom killing, we must fail if we're still 1589 * under heavy pressure. 1590 */ 1591 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1592 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1593 if (page) { 1594 clear_zonelist_oom(zonelist); 1595 goto got_pg; 1596 } 1597 1598 /* The OOM killer will not help higher order allocs so fail */ 1599 if (order > PAGE_ALLOC_COSTLY_ORDER) { 1600 clear_zonelist_oom(zonelist); 1601 goto nopage; 1602 } 1603 1604 out_of_memory(zonelist, gfp_mask, order); 1605 clear_zonelist_oom(zonelist); 1606 goto restart; 1607 } 1608 1609 /* 1610 * Don't let big-order allocations loop unless the caller explicitly 1611 * requests that. Wait for some write requests to complete then retry. 1612 * 1613 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 1614 * <= 3, but that may not be true in other implementations. 1615 */ 1616 do_retry = 0; 1617 if (!(gfp_mask & __GFP_NORETRY)) { 1618 if ((order <= PAGE_ALLOC_COSTLY_ORDER) || 1619 (gfp_mask & __GFP_REPEAT)) 1620 do_retry = 1; 1621 if (gfp_mask & __GFP_NOFAIL) 1622 do_retry = 1; 1623 } 1624 if (do_retry) { 1625 congestion_wait(WRITE, HZ/50); 1626 goto rebalance; 1627 } 1628 1629 nopage: 1630 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 1631 printk(KERN_WARNING "%s: page allocation failure." 1632 " order:%d, mode:0x%x\n", 1633 p->comm, order, gfp_mask); 1634 dump_stack(); 1635 show_mem(); 1636 } 1637 got_pg: 1638 return page; 1639 } 1640 1641 EXPORT_SYMBOL(__alloc_pages); 1642 1643 /* 1644 * Common helper functions. 1645 */ 1646 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1647 { 1648 struct page * page; 1649 page = alloc_pages(gfp_mask, order); 1650 if (!page) 1651 return 0; 1652 return (unsigned long) page_address(page); 1653 } 1654 1655 EXPORT_SYMBOL(__get_free_pages); 1656 1657 unsigned long get_zeroed_page(gfp_t gfp_mask) 1658 { 1659 struct page * page; 1660 1661 /* 1662 * get_zeroed_page() returns a 32-bit address, which cannot represent 1663 * a highmem page 1664 */ 1665 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1666 1667 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1668 if (page) 1669 return (unsigned long) page_address(page); 1670 return 0; 1671 } 1672 1673 EXPORT_SYMBOL(get_zeroed_page); 1674 1675 void __pagevec_free(struct pagevec *pvec) 1676 { 1677 int i = pagevec_count(pvec); 1678 1679 while (--i >= 0) 1680 free_hot_cold_page(pvec->pages[i], pvec->cold); 1681 } 1682 1683 void __free_pages(struct page *page, unsigned int order) 1684 { 1685 if (put_page_testzero(page)) { 1686 if (order == 0) 1687 free_hot_page(page); 1688 else 1689 __free_pages_ok(page, order); 1690 } 1691 } 1692 1693 EXPORT_SYMBOL(__free_pages); 1694 1695 void free_pages(unsigned long addr, unsigned int order) 1696 { 1697 if (addr != 0) { 1698 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1699 __free_pages(virt_to_page((void *)addr), order); 1700 } 1701 } 1702 1703 EXPORT_SYMBOL(free_pages); 1704 1705 static unsigned int nr_free_zone_pages(int offset) 1706 { 1707 /* Just pick one node, since fallback list is circular */ 1708 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1709 unsigned int sum = 0; 1710 1711 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1712 struct zone **zonep = zonelist->zones; 1713 struct zone *zone; 1714 1715 for (zone = *zonep++; zone; zone = *zonep++) { 1716 unsigned long size = zone->present_pages; 1717 unsigned long high = zone->pages_high; 1718 if (size > high) 1719 sum += size - high; 1720 } 1721 1722 return sum; 1723 } 1724 1725 /* 1726 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1727 */ 1728 unsigned int nr_free_buffer_pages(void) 1729 { 1730 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1731 } 1732 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 1733 1734 /* 1735 * Amount of free RAM allocatable within all zones 1736 */ 1737 unsigned int nr_free_pagecache_pages(void) 1738 { 1739 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 1740 } 1741 1742 static inline void show_node(struct zone *zone) 1743 { 1744 if (NUMA_BUILD) 1745 printk("Node %d ", zone_to_nid(zone)); 1746 } 1747 1748 void si_meminfo(struct sysinfo *val) 1749 { 1750 val->totalram = totalram_pages; 1751 val->sharedram = 0; 1752 val->freeram = global_page_state(NR_FREE_PAGES); 1753 val->bufferram = nr_blockdev_pages(); 1754 val->totalhigh = totalhigh_pages; 1755 val->freehigh = nr_free_highpages(); 1756 val->mem_unit = PAGE_SIZE; 1757 } 1758 1759 EXPORT_SYMBOL(si_meminfo); 1760 1761 #ifdef CONFIG_NUMA 1762 void si_meminfo_node(struct sysinfo *val, int nid) 1763 { 1764 pg_data_t *pgdat = NODE_DATA(nid); 1765 1766 val->totalram = pgdat->node_present_pages; 1767 val->freeram = node_page_state(nid, NR_FREE_PAGES); 1768 #ifdef CONFIG_HIGHMEM 1769 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1770 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 1771 NR_FREE_PAGES); 1772 #else 1773 val->totalhigh = 0; 1774 val->freehigh = 0; 1775 #endif 1776 val->mem_unit = PAGE_SIZE; 1777 } 1778 #endif 1779 1780 #define K(x) ((x) << (PAGE_SHIFT-10)) 1781 1782 /* 1783 * Show free area list (used inside shift_scroll-lock stuff) 1784 * We also calculate the percentage fragmentation. We do this by counting the 1785 * memory on each free list with the exception of the first item on the list. 1786 */ 1787 void show_free_areas(void) 1788 { 1789 int cpu; 1790 struct zone *zone; 1791 1792 for_each_zone(zone) { 1793 if (!populated_zone(zone)) 1794 continue; 1795 1796 show_node(zone); 1797 printk("%s per-cpu:\n", zone->name); 1798 1799 for_each_online_cpu(cpu) { 1800 struct per_cpu_pageset *pageset; 1801 1802 pageset = zone_pcp(zone, cpu); 1803 1804 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 1805 cpu, pageset->pcp.high, 1806 pageset->pcp.batch, pageset->pcp.count); 1807 } 1808 } 1809 1810 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1811 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1812 global_page_state(NR_ACTIVE), 1813 global_page_state(NR_INACTIVE), 1814 global_page_state(NR_FILE_DIRTY), 1815 global_page_state(NR_WRITEBACK), 1816 global_page_state(NR_UNSTABLE_NFS), 1817 global_page_state(NR_FREE_PAGES), 1818 global_page_state(NR_SLAB_RECLAIMABLE) + 1819 global_page_state(NR_SLAB_UNRECLAIMABLE), 1820 global_page_state(NR_FILE_MAPPED), 1821 global_page_state(NR_PAGETABLE), 1822 global_page_state(NR_BOUNCE)); 1823 1824 for_each_zone(zone) { 1825 int i; 1826 1827 if (!populated_zone(zone)) 1828 continue; 1829 1830 show_node(zone); 1831 printk("%s" 1832 " free:%lukB" 1833 " min:%lukB" 1834 " low:%lukB" 1835 " high:%lukB" 1836 " active:%lukB" 1837 " inactive:%lukB" 1838 " present:%lukB" 1839 " pages_scanned:%lu" 1840 " all_unreclaimable? %s" 1841 "\n", 1842 zone->name, 1843 K(zone_page_state(zone, NR_FREE_PAGES)), 1844 K(zone->pages_min), 1845 K(zone->pages_low), 1846 K(zone->pages_high), 1847 K(zone_page_state(zone, NR_ACTIVE)), 1848 K(zone_page_state(zone, NR_INACTIVE)), 1849 K(zone->present_pages), 1850 zone->pages_scanned, 1851 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 1852 ); 1853 printk("lowmem_reserve[]:"); 1854 for (i = 0; i < MAX_NR_ZONES; i++) 1855 printk(" %lu", zone->lowmem_reserve[i]); 1856 printk("\n"); 1857 } 1858 1859 for_each_zone(zone) { 1860 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1861 1862 if (!populated_zone(zone)) 1863 continue; 1864 1865 show_node(zone); 1866 printk("%s: ", zone->name); 1867 1868 spin_lock_irqsave(&zone->lock, flags); 1869 for (order = 0; order < MAX_ORDER; order++) { 1870 nr[order] = zone->free_area[order].nr_free; 1871 total += nr[order] << order; 1872 } 1873 spin_unlock_irqrestore(&zone->lock, flags); 1874 for (order = 0; order < MAX_ORDER; order++) 1875 printk("%lu*%lukB ", nr[order], K(1UL) << order); 1876 printk("= %lukB\n", K(total)); 1877 } 1878 1879 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 1880 1881 show_swap_cache_info(); 1882 } 1883 1884 /* 1885 * Builds allocation fallback zone lists. 1886 * 1887 * Add all populated zones of a node to the zonelist. 1888 */ 1889 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 1890 int nr_zones, enum zone_type zone_type) 1891 { 1892 struct zone *zone; 1893 1894 BUG_ON(zone_type >= MAX_NR_ZONES); 1895 zone_type++; 1896 1897 do { 1898 zone_type--; 1899 zone = pgdat->node_zones + zone_type; 1900 if (populated_zone(zone)) { 1901 zonelist->zones[nr_zones++] = zone; 1902 check_highest_zone(zone_type); 1903 } 1904 1905 } while (zone_type); 1906 return nr_zones; 1907 } 1908 1909 1910 /* 1911 * zonelist_order: 1912 * 0 = automatic detection of better ordering. 1913 * 1 = order by ([node] distance, -zonetype) 1914 * 2 = order by (-zonetype, [node] distance) 1915 * 1916 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 1917 * the same zonelist. So only NUMA can configure this param. 1918 */ 1919 #define ZONELIST_ORDER_DEFAULT 0 1920 #define ZONELIST_ORDER_NODE 1 1921 #define ZONELIST_ORDER_ZONE 2 1922 1923 /* zonelist order in the kernel. 1924 * set_zonelist_order() will set this to NODE or ZONE. 1925 */ 1926 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 1927 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 1928 1929 1930 #ifdef CONFIG_NUMA 1931 /* The value user specified ....changed by config */ 1932 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 1933 /* string for sysctl */ 1934 #define NUMA_ZONELIST_ORDER_LEN 16 1935 char numa_zonelist_order[16] = "default"; 1936 1937 /* 1938 * interface for configure zonelist ordering. 1939 * command line option "numa_zonelist_order" 1940 * = "[dD]efault - default, automatic configuration. 1941 * = "[nN]ode - order by node locality, then by zone within node 1942 * = "[zZ]one - order by zone, then by locality within zone 1943 */ 1944 1945 static int __parse_numa_zonelist_order(char *s) 1946 { 1947 if (*s == 'd' || *s == 'D') { 1948 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 1949 } else if (*s == 'n' || *s == 'N') { 1950 user_zonelist_order = ZONELIST_ORDER_NODE; 1951 } else if (*s == 'z' || *s == 'Z') { 1952 user_zonelist_order = ZONELIST_ORDER_ZONE; 1953 } else { 1954 printk(KERN_WARNING 1955 "Ignoring invalid numa_zonelist_order value: " 1956 "%s\n", s); 1957 return -EINVAL; 1958 } 1959 return 0; 1960 } 1961 1962 static __init int setup_numa_zonelist_order(char *s) 1963 { 1964 if (s) 1965 return __parse_numa_zonelist_order(s); 1966 return 0; 1967 } 1968 early_param("numa_zonelist_order", setup_numa_zonelist_order); 1969 1970 /* 1971 * sysctl handler for numa_zonelist_order 1972 */ 1973 int numa_zonelist_order_handler(ctl_table *table, int write, 1974 struct file *file, void __user *buffer, size_t *length, 1975 loff_t *ppos) 1976 { 1977 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 1978 int ret; 1979 1980 if (write) 1981 strncpy(saved_string, (char*)table->data, 1982 NUMA_ZONELIST_ORDER_LEN); 1983 ret = proc_dostring(table, write, file, buffer, length, ppos); 1984 if (ret) 1985 return ret; 1986 if (write) { 1987 int oldval = user_zonelist_order; 1988 if (__parse_numa_zonelist_order((char*)table->data)) { 1989 /* 1990 * bogus value. restore saved string 1991 */ 1992 strncpy((char*)table->data, saved_string, 1993 NUMA_ZONELIST_ORDER_LEN); 1994 user_zonelist_order = oldval; 1995 } else if (oldval != user_zonelist_order) 1996 build_all_zonelists(); 1997 } 1998 return 0; 1999 } 2000 2001 2002 #define MAX_NODE_LOAD (num_online_nodes()) 2003 static int node_load[MAX_NUMNODES]; 2004 2005 /** 2006 * find_next_best_node - find the next node that should appear in a given node's fallback list 2007 * @node: node whose fallback list we're appending 2008 * @used_node_mask: nodemask_t of already used nodes 2009 * 2010 * We use a number of factors to determine which is the next node that should 2011 * appear on a given node's fallback list. The node should not have appeared 2012 * already in @node's fallback list, and it should be the next closest node 2013 * according to the distance array (which contains arbitrary distance values 2014 * from each node to each node in the system), and should also prefer nodes 2015 * with no CPUs, since presumably they'll have very little allocation pressure 2016 * on them otherwise. 2017 * It returns -1 if no node is found. 2018 */ 2019 static int find_next_best_node(int node, nodemask_t *used_node_mask) 2020 { 2021 int n, val; 2022 int min_val = INT_MAX; 2023 int best_node = -1; 2024 2025 /* Use the local node if we haven't already */ 2026 if (!node_isset(node, *used_node_mask)) { 2027 node_set(node, *used_node_mask); 2028 return node; 2029 } 2030 2031 for_each_node_state(n, N_HIGH_MEMORY) { 2032 cpumask_t tmp; 2033 2034 /* Don't want a node to appear more than once */ 2035 if (node_isset(n, *used_node_mask)) 2036 continue; 2037 2038 /* Use the distance array to find the distance */ 2039 val = node_distance(node, n); 2040 2041 /* Penalize nodes under us ("prefer the next node") */ 2042 val += (n < node); 2043 2044 /* Give preference to headless and unused nodes */ 2045 tmp = node_to_cpumask(n); 2046 if (!cpus_empty(tmp)) 2047 val += PENALTY_FOR_NODE_WITH_CPUS; 2048 2049 /* Slight preference for less loaded node */ 2050 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 2051 val += node_load[n]; 2052 2053 if (val < min_val) { 2054 min_val = val; 2055 best_node = n; 2056 } 2057 } 2058 2059 if (best_node >= 0) 2060 node_set(best_node, *used_node_mask); 2061 2062 return best_node; 2063 } 2064 2065 2066 /* 2067 * Build zonelists ordered by node and zones within node. 2068 * This results in maximum locality--normal zone overflows into local 2069 * DMA zone, if any--but risks exhausting DMA zone. 2070 */ 2071 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 2072 { 2073 enum zone_type i; 2074 int j; 2075 struct zonelist *zonelist; 2076 2077 for (i = 0; i < MAX_NR_ZONES; i++) { 2078 zonelist = pgdat->node_zonelists + i; 2079 for (j = 0; zonelist->zones[j] != NULL; j++) 2080 ; 2081 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2082 zonelist->zones[j] = NULL; 2083 } 2084 } 2085 2086 /* 2087 * Build gfp_thisnode zonelists 2088 */ 2089 static void build_thisnode_zonelists(pg_data_t *pgdat) 2090 { 2091 enum zone_type i; 2092 int j; 2093 struct zonelist *zonelist; 2094 2095 for (i = 0; i < MAX_NR_ZONES; i++) { 2096 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; 2097 j = build_zonelists_node(pgdat, zonelist, 0, i); 2098 zonelist->zones[j] = NULL; 2099 } 2100 } 2101 2102 /* 2103 * Build zonelists ordered by zone and nodes within zones. 2104 * This results in conserving DMA zone[s] until all Normal memory is 2105 * exhausted, but results in overflowing to remote node while memory 2106 * may still exist in local DMA zone. 2107 */ 2108 static int node_order[MAX_NUMNODES]; 2109 2110 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 2111 { 2112 enum zone_type i; 2113 int pos, j, node; 2114 int zone_type; /* needs to be signed */ 2115 struct zone *z; 2116 struct zonelist *zonelist; 2117 2118 for (i = 0; i < MAX_NR_ZONES; i++) { 2119 zonelist = pgdat->node_zonelists + i; 2120 pos = 0; 2121 for (zone_type = i; zone_type >= 0; zone_type--) { 2122 for (j = 0; j < nr_nodes; j++) { 2123 node = node_order[j]; 2124 z = &NODE_DATA(node)->node_zones[zone_type]; 2125 if (populated_zone(z)) { 2126 zonelist->zones[pos++] = z; 2127 check_highest_zone(zone_type); 2128 } 2129 } 2130 } 2131 zonelist->zones[pos] = NULL; 2132 } 2133 } 2134 2135 static int default_zonelist_order(void) 2136 { 2137 int nid, zone_type; 2138 unsigned long low_kmem_size,total_size; 2139 struct zone *z; 2140 int average_size; 2141 /* 2142 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. 2143 * If they are really small and used heavily, the system can fall 2144 * into OOM very easily. 2145 * This function detect ZONE_DMA/DMA32 size and confgigures zone order. 2146 */ 2147 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 2148 low_kmem_size = 0; 2149 total_size = 0; 2150 for_each_online_node(nid) { 2151 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 2152 z = &NODE_DATA(nid)->node_zones[zone_type]; 2153 if (populated_zone(z)) { 2154 if (zone_type < ZONE_NORMAL) 2155 low_kmem_size += z->present_pages; 2156 total_size += z->present_pages; 2157 } 2158 } 2159 } 2160 if (!low_kmem_size || /* there are no DMA area. */ 2161 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 2162 return ZONELIST_ORDER_NODE; 2163 /* 2164 * look into each node's config. 2165 * If there is a node whose DMA/DMA32 memory is very big area on 2166 * local memory, NODE_ORDER may be suitable. 2167 */ 2168 average_size = total_size / 2169 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 2170 for_each_online_node(nid) { 2171 low_kmem_size = 0; 2172 total_size = 0; 2173 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 2174 z = &NODE_DATA(nid)->node_zones[zone_type]; 2175 if (populated_zone(z)) { 2176 if (zone_type < ZONE_NORMAL) 2177 low_kmem_size += z->present_pages; 2178 total_size += z->present_pages; 2179 } 2180 } 2181 if (low_kmem_size && 2182 total_size > average_size && /* ignore small node */ 2183 low_kmem_size > total_size * 70/100) 2184 return ZONELIST_ORDER_NODE; 2185 } 2186 return ZONELIST_ORDER_ZONE; 2187 } 2188 2189 static void set_zonelist_order(void) 2190 { 2191 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 2192 current_zonelist_order = default_zonelist_order(); 2193 else 2194 current_zonelist_order = user_zonelist_order; 2195 } 2196 2197 static void build_zonelists(pg_data_t *pgdat) 2198 { 2199 int j, node, load; 2200 enum zone_type i; 2201 nodemask_t used_mask; 2202 int local_node, prev_node; 2203 struct zonelist *zonelist; 2204 int order = current_zonelist_order; 2205 2206 /* initialize zonelists */ 2207 for (i = 0; i < MAX_ZONELISTS; i++) { 2208 zonelist = pgdat->node_zonelists + i; 2209 zonelist->zones[0] = NULL; 2210 } 2211 2212 /* NUMA-aware ordering of nodes */ 2213 local_node = pgdat->node_id; 2214 load = num_online_nodes(); 2215 prev_node = local_node; 2216 nodes_clear(used_mask); 2217 2218 memset(node_load, 0, sizeof(node_load)); 2219 memset(node_order, 0, sizeof(node_order)); 2220 j = 0; 2221 2222 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 2223 int distance = node_distance(local_node, node); 2224 2225 /* 2226 * If another node is sufficiently far away then it is better 2227 * to reclaim pages in a zone before going off node. 2228 */ 2229 if (distance > RECLAIM_DISTANCE) 2230 zone_reclaim_mode = 1; 2231 2232 /* 2233 * We don't want to pressure a particular node. 2234 * So adding penalty to the first node in same 2235 * distance group to make it round-robin. 2236 */ 2237 if (distance != node_distance(local_node, prev_node)) 2238 node_load[node] = load; 2239 2240 prev_node = node; 2241 load--; 2242 if (order == ZONELIST_ORDER_NODE) 2243 build_zonelists_in_node_order(pgdat, node); 2244 else 2245 node_order[j++] = node; /* remember order */ 2246 } 2247 2248 if (order == ZONELIST_ORDER_ZONE) { 2249 /* calculate node order -- i.e., DMA last! */ 2250 build_zonelists_in_zone_order(pgdat, j); 2251 } 2252 2253 build_thisnode_zonelists(pgdat); 2254 } 2255 2256 /* Construct the zonelist performance cache - see further mmzone.h */ 2257 static void build_zonelist_cache(pg_data_t *pgdat) 2258 { 2259 int i; 2260 2261 for (i = 0; i < MAX_NR_ZONES; i++) { 2262 struct zonelist *zonelist; 2263 struct zonelist_cache *zlc; 2264 struct zone **z; 2265 2266 zonelist = pgdat->node_zonelists + i; 2267 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 2268 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 2269 for (z = zonelist->zones; *z; z++) 2270 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 2271 } 2272 } 2273 2274 2275 #else /* CONFIG_NUMA */ 2276 2277 static void set_zonelist_order(void) 2278 { 2279 current_zonelist_order = ZONELIST_ORDER_ZONE; 2280 } 2281 2282 static void build_zonelists(pg_data_t *pgdat) 2283 { 2284 int node, local_node; 2285 enum zone_type i,j; 2286 2287 local_node = pgdat->node_id; 2288 for (i = 0; i < MAX_NR_ZONES; i++) { 2289 struct zonelist *zonelist; 2290 2291 zonelist = pgdat->node_zonelists + i; 2292 2293 j = build_zonelists_node(pgdat, zonelist, 0, i); 2294 /* 2295 * Now we build the zonelist so that it contains the zones 2296 * of all the other nodes. 2297 * We don't want to pressure a particular node, so when 2298 * building the zones for node N, we make sure that the 2299 * zones coming right after the local ones are those from 2300 * node N+1 (modulo N) 2301 */ 2302 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 2303 if (!node_online(node)) 2304 continue; 2305 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2306 } 2307 for (node = 0; node < local_node; node++) { 2308 if (!node_online(node)) 2309 continue; 2310 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2311 } 2312 2313 zonelist->zones[j] = NULL; 2314 } 2315 } 2316 2317 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2318 static void build_zonelist_cache(pg_data_t *pgdat) 2319 { 2320 int i; 2321 2322 for (i = 0; i < MAX_NR_ZONES; i++) 2323 pgdat->node_zonelists[i].zlcache_ptr = NULL; 2324 } 2325 2326 #endif /* CONFIG_NUMA */ 2327 2328 /* return values int ....just for stop_machine_run() */ 2329 static int __build_all_zonelists(void *dummy) 2330 { 2331 int nid; 2332 2333 for_each_online_node(nid) { 2334 pg_data_t *pgdat = NODE_DATA(nid); 2335 2336 build_zonelists(pgdat); 2337 build_zonelist_cache(pgdat); 2338 } 2339 return 0; 2340 } 2341 2342 void build_all_zonelists(void) 2343 { 2344 set_zonelist_order(); 2345 2346 if (system_state == SYSTEM_BOOTING) { 2347 __build_all_zonelists(NULL); 2348 cpuset_init_current_mems_allowed(); 2349 } else { 2350 /* we have to stop all cpus to guarantee there is no user 2351 of zonelist */ 2352 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2353 /* cpuset refresh routine should be here */ 2354 } 2355 vm_total_pages = nr_free_pagecache_pages(); 2356 /* 2357 * Disable grouping by mobility if the number of pages in the 2358 * system is too low to allow the mechanism to work. It would be 2359 * more accurate, but expensive to check per-zone. This check is 2360 * made on memory-hotadd so a system can start with mobility 2361 * disabled and enable it later 2362 */ 2363 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 2364 page_group_by_mobility_disabled = 1; 2365 else 2366 page_group_by_mobility_disabled = 0; 2367 2368 printk("Built %i zonelists in %s order, mobility grouping %s. " 2369 "Total pages: %ld\n", 2370 num_online_nodes(), 2371 zonelist_order_name[current_zonelist_order], 2372 page_group_by_mobility_disabled ? "off" : "on", 2373 vm_total_pages); 2374 #ifdef CONFIG_NUMA 2375 printk("Policy zone: %s\n", zone_names[policy_zone]); 2376 #endif 2377 } 2378 2379 /* 2380 * Helper functions to size the waitqueue hash table. 2381 * Essentially these want to choose hash table sizes sufficiently 2382 * large so that collisions trying to wait on pages are rare. 2383 * But in fact, the number of active page waitqueues on typical 2384 * systems is ridiculously low, less than 200. So this is even 2385 * conservative, even though it seems large. 2386 * 2387 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 2388 * waitqueues, i.e. the size of the waitq table given the number of pages. 2389 */ 2390 #define PAGES_PER_WAITQUEUE 256 2391 2392 #ifndef CONFIG_MEMORY_HOTPLUG 2393 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2394 { 2395 unsigned long size = 1; 2396 2397 pages /= PAGES_PER_WAITQUEUE; 2398 2399 while (size < pages) 2400 size <<= 1; 2401 2402 /* 2403 * Once we have dozens or even hundreds of threads sleeping 2404 * on IO we've got bigger problems than wait queue collision. 2405 * Limit the size of the wait table to a reasonable size. 2406 */ 2407 size = min(size, 4096UL); 2408 2409 return max(size, 4UL); 2410 } 2411 #else 2412 /* 2413 * A zone's size might be changed by hot-add, so it is not possible to determine 2414 * a suitable size for its wait_table. So we use the maximum size now. 2415 * 2416 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 2417 * 2418 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 2419 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 2420 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 2421 * 2422 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 2423 * or more by the traditional way. (See above). It equals: 2424 * 2425 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 2426 * ia64(16K page size) : = ( 8G + 4M)byte. 2427 * powerpc (64K page size) : = (32G +16M)byte. 2428 */ 2429 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2430 { 2431 return 4096UL; 2432 } 2433 #endif 2434 2435 /* 2436 * This is an integer logarithm so that shifts can be used later 2437 * to extract the more random high bits from the multiplicative 2438 * hash function before the remainder is taken. 2439 */ 2440 static inline unsigned long wait_table_bits(unsigned long size) 2441 { 2442 return ffz(~size); 2443 } 2444 2445 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2446 2447 /* 2448 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2449 * of blocks reserved is based on zone->pages_min. The memory within the 2450 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2451 * higher will lead to a bigger reserve which will get freed as contiguous 2452 * blocks as reclaim kicks in 2453 */ 2454 static void setup_zone_migrate_reserve(struct zone *zone) 2455 { 2456 unsigned long start_pfn, pfn, end_pfn; 2457 struct page *page; 2458 unsigned long reserve, block_migratetype; 2459 2460 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2461 start_pfn = zone->zone_start_pfn; 2462 end_pfn = start_pfn + zone->spanned_pages; 2463 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2464 pageblock_order; 2465 2466 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2467 if (!pfn_valid(pfn)) 2468 continue; 2469 page = pfn_to_page(pfn); 2470 2471 /* Blocks with reserved pages will never free, skip them. */ 2472 if (PageReserved(page)) 2473 continue; 2474 2475 block_migratetype = get_pageblock_migratetype(page); 2476 2477 /* If this block is reserved, account for it */ 2478 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 2479 reserve--; 2480 continue; 2481 } 2482 2483 /* Suitable for reserving if this block is movable */ 2484 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 2485 set_pageblock_migratetype(page, MIGRATE_RESERVE); 2486 move_freepages_block(zone, page, MIGRATE_RESERVE); 2487 reserve--; 2488 continue; 2489 } 2490 2491 /* 2492 * If the reserve is met and this is a previous reserved block, 2493 * take it back 2494 */ 2495 if (block_migratetype == MIGRATE_RESERVE) { 2496 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2497 move_freepages_block(zone, page, MIGRATE_MOVABLE); 2498 } 2499 } 2500 } 2501 2502 /* 2503 * Initially all pages are reserved - free ones are freed 2504 * up by free_all_bootmem() once the early boot process is 2505 * done. Non-atomic initialization, single-pass. 2506 */ 2507 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 2508 unsigned long start_pfn, enum memmap_context context) 2509 { 2510 struct page *page; 2511 unsigned long end_pfn = start_pfn + size; 2512 unsigned long pfn; 2513 2514 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2515 /* 2516 * There can be holes in boot-time mem_map[]s 2517 * handed to this function. They do not 2518 * exist on hotplugged memory. 2519 */ 2520 if (context == MEMMAP_EARLY) { 2521 if (!early_pfn_valid(pfn)) 2522 continue; 2523 if (!early_pfn_in_nid(pfn, nid)) 2524 continue; 2525 } 2526 page = pfn_to_page(pfn); 2527 set_page_links(page, zone, nid, pfn); 2528 init_page_count(page); 2529 reset_page_mapcount(page); 2530 page_assign_page_cgroup(page, NULL); 2531 SetPageReserved(page); 2532 2533 /* 2534 * Mark the block movable so that blocks are reserved for 2535 * movable at startup. This will force kernel allocations 2536 * to reserve their blocks rather than leaking throughout 2537 * the address space during boot when many long-lived 2538 * kernel allocations are made. Later some blocks near 2539 * the start are marked MIGRATE_RESERVE by 2540 * setup_zone_migrate_reserve() 2541 */ 2542 if ((pfn & (pageblock_nr_pages-1))) 2543 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2544 2545 INIT_LIST_HEAD(&page->lru); 2546 #ifdef WANT_PAGE_VIRTUAL 2547 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 2548 if (!is_highmem_idx(zone)) 2549 set_page_address(page, __va(pfn << PAGE_SHIFT)); 2550 #endif 2551 } 2552 } 2553 2554 static void __meminit zone_init_free_lists(struct zone *zone) 2555 { 2556 int order, t; 2557 for_each_migratetype_order(order, t) { 2558 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 2559 zone->free_area[order].nr_free = 0; 2560 } 2561 } 2562 2563 #ifndef __HAVE_ARCH_MEMMAP_INIT 2564 #define memmap_init(size, nid, zone, start_pfn) \ 2565 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 2566 #endif 2567 2568 static int zone_batchsize(struct zone *zone) 2569 { 2570 int batch; 2571 2572 /* 2573 * The per-cpu-pages pools are set to around 1000th of the 2574 * size of the zone. But no more than 1/2 of a meg. 2575 * 2576 * OK, so we don't know how big the cache is. So guess. 2577 */ 2578 batch = zone->present_pages / 1024; 2579 if (batch * PAGE_SIZE > 512 * 1024) 2580 batch = (512 * 1024) / PAGE_SIZE; 2581 batch /= 4; /* We effectively *= 4 below */ 2582 if (batch < 1) 2583 batch = 1; 2584 2585 /* 2586 * Clamp the batch to a 2^n - 1 value. Having a power 2587 * of 2 value was found to be more likely to have 2588 * suboptimal cache aliasing properties in some cases. 2589 * 2590 * For example if 2 tasks are alternately allocating 2591 * batches of pages, one task can end up with a lot 2592 * of pages of one half of the possible page colors 2593 * and the other with pages of the other colors. 2594 */ 2595 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2596 2597 return batch; 2598 } 2599 2600 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2601 { 2602 struct per_cpu_pages *pcp; 2603 2604 memset(p, 0, sizeof(*p)); 2605 2606 pcp = &p->pcp; 2607 pcp->count = 0; 2608 pcp->high = 6 * batch; 2609 pcp->batch = max(1UL, 1 * batch); 2610 INIT_LIST_HEAD(&pcp->list); 2611 } 2612 2613 /* 2614 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 2615 * to the value high for the pageset p. 2616 */ 2617 2618 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 2619 unsigned long high) 2620 { 2621 struct per_cpu_pages *pcp; 2622 2623 pcp = &p->pcp; 2624 pcp->high = high; 2625 pcp->batch = max(1UL, high/4); 2626 if ((high/4) > (PAGE_SHIFT * 8)) 2627 pcp->batch = PAGE_SHIFT * 8; 2628 } 2629 2630 2631 #ifdef CONFIG_NUMA 2632 /* 2633 * Boot pageset table. One per cpu which is going to be used for all 2634 * zones and all nodes. The parameters will be set in such a way 2635 * that an item put on a list will immediately be handed over to 2636 * the buddy list. This is safe since pageset manipulation is done 2637 * with interrupts disabled. 2638 * 2639 * Some NUMA counter updates may also be caught by the boot pagesets. 2640 * 2641 * The boot_pagesets must be kept even after bootup is complete for 2642 * unused processors and/or zones. They do play a role for bootstrapping 2643 * hotplugged processors. 2644 * 2645 * zoneinfo_show() and maybe other functions do 2646 * not check if the processor is online before following the pageset pointer. 2647 * Other parts of the kernel may not check if the zone is available. 2648 */ 2649 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 2650 2651 /* 2652 * Dynamically allocate memory for the 2653 * per cpu pageset array in struct zone. 2654 */ 2655 static int __cpuinit process_zones(int cpu) 2656 { 2657 struct zone *zone, *dzone; 2658 int node = cpu_to_node(cpu); 2659 2660 node_set_state(node, N_CPU); /* this node has a cpu */ 2661 2662 for_each_zone(zone) { 2663 2664 if (!populated_zone(zone)) 2665 continue; 2666 2667 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2668 GFP_KERNEL, node); 2669 if (!zone_pcp(zone, cpu)) 2670 goto bad; 2671 2672 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2673 2674 if (percpu_pagelist_fraction) 2675 setup_pagelist_highmark(zone_pcp(zone, cpu), 2676 (zone->present_pages / percpu_pagelist_fraction)); 2677 } 2678 2679 return 0; 2680 bad: 2681 for_each_zone(dzone) { 2682 if (!populated_zone(dzone)) 2683 continue; 2684 if (dzone == zone) 2685 break; 2686 kfree(zone_pcp(dzone, cpu)); 2687 zone_pcp(dzone, cpu) = NULL; 2688 } 2689 return -ENOMEM; 2690 } 2691 2692 static inline void free_zone_pagesets(int cpu) 2693 { 2694 struct zone *zone; 2695 2696 for_each_zone(zone) { 2697 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2698 2699 /* Free per_cpu_pageset if it is slab allocated */ 2700 if (pset != &boot_pageset[cpu]) 2701 kfree(pset); 2702 zone_pcp(zone, cpu) = NULL; 2703 } 2704 } 2705 2706 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2707 unsigned long action, 2708 void *hcpu) 2709 { 2710 int cpu = (long)hcpu; 2711 int ret = NOTIFY_OK; 2712 2713 switch (action) { 2714 case CPU_UP_PREPARE: 2715 case CPU_UP_PREPARE_FROZEN: 2716 if (process_zones(cpu)) 2717 ret = NOTIFY_BAD; 2718 break; 2719 case CPU_UP_CANCELED: 2720 case CPU_UP_CANCELED_FROZEN: 2721 case CPU_DEAD: 2722 case CPU_DEAD_FROZEN: 2723 free_zone_pagesets(cpu); 2724 break; 2725 default: 2726 break; 2727 } 2728 return ret; 2729 } 2730 2731 static struct notifier_block __cpuinitdata pageset_notifier = 2732 { &pageset_cpuup_callback, NULL, 0 }; 2733 2734 void __init setup_per_cpu_pageset(void) 2735 { 2736 int err; 2737 2738 /* Initialize per_cpu_pageset for cpu 0. 2739 * A cpuup callback will do this for every cpu 2740 * as it comes online 2741 */ 2742 err = process_zones(smp_processor_id()); 2743 BUG_ON(err); 2744 register_cpu_notifier(&pageset_notifier); 2745 } 2746 2747 #endif 2748 2749 static noinline __init_refok 2750 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2751 { 2752 int i; 2753 struct pglist_data *pgdat = zone->zone_pgdat; 2754 size_t alloc_size; 2755 2756 /* 2757 * The per-page waitqueue mechanism uses hashed waitqueues 2758 * per zone. 2759 */ 2760 zone->wait_table_hash_nr_entries = 2761 wait_table_hash_nr_entries(zone_size_pages); 2762 zone->wait_table_bits = 2763 wait_table_bits(zone->wait_table_hash_nr_entries); 2764 alloc_size = zone->wait_table_hash_nr_entries 2765 * sizeof(wait_queue_head_t); 2766 2767 if (system_state == SYSTEM_BOOTING) { 2768 zone->wait_table = (wait_queue_head_t *) 2769 alloc_bootmem_node(pgdat, alloc_size); 2770 } else { 2771 /* 2772 * This case means that a zone whose size was 0 gets new memory 2773 * via memory hot-add. 2774 * But it may be the case that a new node was hot-added. In 2775 * this case vmalloc() will not be able to use this new node's 2776 * memory - this wait_table must be initialized to use this new 2777 * node itself as well. 2778 * To use this new node's memory, further consideration will be 2779 * necessary. 2780 */ 2781 zone->wait_table = vmalloc(alloc_size); 2782 } 2783 if (!zone->wait_table) 2784 return -ENOMEM; 2785 2786 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2787 init_waitqueue_head(zone->wait_table + i); 2788 2789 return 0; 2790 } 2791 2792 static __meminit void zone_pcp_init(struct zone *zone) 2793 { 2794 int cpu; 2795 unsigned long batch = zone_batchsize(zone); 2796 2797 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2798 #ifdef CONFIG_NUMA 2799 /* Early boot. Slab allocator not functional yet */ 2800 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2801 setup_pageset(&boot_pageset[cpu],0); 2802 #else 2803 setup_pageset(zone_pcp(zone,cpu), batch); 2804 #endif 2805 } 2806 if (zone->present_pages) 2807 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2808 zone->name, zone->present_pages, batch); 2809 } 2810 2811 __meminit int init_currently_empty_zone(struct zone *zone, 2812 unsigned long zone_start_pfn, 2813 unsigned long size, 2814 enum memmap_context context) 2815 { 2816 struct pglist_data *pgdat = zone->zone_pgdat; 2817 int ret; 2818 ret = zone_wait_table_init(zone, size); 2819 if (ret) 2820 return ret; 2821 pgdat->nr_zones = zone_idx(zone) + 1; 2822 2823 zone->zone_start_pfn = zone_start_pfn; 2824 2825 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2826 2827 zone_init_free_lists(zone); 2828 2829 return 0; 2830 } 2831 2832 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2833 /* 2834 * Basic iterator support. Return the first range of PFNs for a node 2835 * Note: nid == MAX_NUMNODES returns first region regardless of node 2836 */ 2837 static int __meminit first_active_region_index_in_nid(int nid) 2838 { 2839 int i; 2840 2841 for (i = 0; i < nr_nodemap_entries; i++) 2842 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2843 return i; 2844 2845 return -1; 2846 } 2847 2848 /* 2849 * Basic iterator support. Return the next active range of PFNs for a node 2850 * Note: nid == MAX_NUMNODES returns next region regardless of node 2851 */ 2852 static int __meminit next_active_region_index_in_nid(int index, int nid) 2853 { 2854 for (index = index + 1; index < nr_nodemap_entries; index++) 2855 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2856 return index; 2857 2858 return -1; 2859 } 2860 2861 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2862 /* 2863 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2864 * Architectures may implement their own version but if add_active_range() 2865 * was used and there are no special requirements, this is a convenient 2866 * alternative 2867 */ 2868 int __meminit early_pfn_to_nid(unsigned long pfn) 2869 { 2870 int i; 2871 2872 for (i = 0; i < nr_nodemap_entries; i++) { 2873 unsigned long start_pfn = early_node_map[i].start_pfn; 2874 unsigned long end_pfn = early_node_map[i].end_pfn; 2875 2876 if (start_pfn <= pfn && pfn < end_pfn) 2877 return early_node_map[i].nid; 2878 } 2879 2880 return 0; 2881 } 2882 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2883 2884 /* Basic iterator support to walk early_node_map[] */ 2885 #define for_each_active_range_index_in_nid(i, nid) \ 2886 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2887 i = next_active_region_index_in_nid(i, nid)) 2888 2889 /** 2890 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2891 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 2892 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2893 * 2894 * If an architecture guarantees that all ranges registered with 2895 * add_active_ranges() contain no holes and may be freed, this 2896 * this function may be used instead of calling free_bootmem() manually. 2897 */ 2898 void __init free_bootmem_with_active_regions(int nid, 2899 unsigned long max_low_pfn) 2900 { 2901 int i; 2902 2903 for_each_active_range_index_in_nid(i, nid) { 2904 unsigned long size_pages = 0; 2905 unsigned long end_pfn = early_node_map[i].end_pfn; 2906 2907 if (early_node_map[i].start_pfn >= max_low_pfn) 2908 continue; 2909 2910 if (end_pfn > max_low_pfn) 2911 end_pfn = max_low_pfn; 2912 2913 size_pages = end_pfn - early_node_map[i].start_pfn; 2914 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2915 PFN_PHYS(early_node_map[i].start_pfn), 2916 size_pages << PAGE_SHIFT); 2917 } 2918 } 2919 2920 /** 2921 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2922 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2923 * 2924 * If an architecture guarantees that all ranges registered with 2925 * add_active_ranges() contain no holes and may be freed, this 2926 * function may be used instead of calling memory_present() manually. 2927 */ 2928 void __init sparse_memory_present_with_active_regions(int nid) 2929 { 2930 int i; 2931 2932 for_each_active_range_index_in_nid(i, nid) 2933 memory_present(early_node_map[i].nid, 2934 early_node_map[i].start_pfn, 2935 early_node_map[i].end_pfn); 2936 } 2937 2938 /** 2939 * push_node_boundaries - Push node boundaries to at least the requested boundary 2940 * @nid: The nid of the node to push the boundary for 2941 * @start_pfn: The start pfn of the node 2942 * @end_pfn: The end pfn of the node 2943 * 2944 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd 2945 * time. Specifically, on x86_64, SRAT will report ranges that can potentially 2946 * be hotplugged even though no physical memory exists. This function allows 2947 * an arch to push out the node boundaries so mem_map is allocated that can 2948 * be used later. 2949 */ 2950 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2951 void __init push_node_boundaries(unsigned int nid, 2952 unsigned long start_pfn, unsigned long end_pfn) 2953 { 2954 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 2955 nid, start_pfn, end_pfn); 2956 2957 /* Initialise the boundary for this node if necessary */ 2958 if (node_boundary_end_pfn[nid] == 0) 2959 node_boundary_start_pfn[nid] = -1UL; 2960 2961 /* Update the boundaries */ 2962 if (node_boundary_start_pfn[nid] > start_pfn) 2963 node_boundary_start_pfn[nid] = start_pfn; 2964 if (node_boundary_end_pfn[nid] < end_pfn) 2965 node_boundary_end_pfn[nid] = end_pfn; 2966 } 2967 2968 /* If necessary, push the node boundary out for reserve hotadd */ 2969 static void __meminit account_node_boundary(unsigned int nid, 2970 unsigned long *start_pfn, unsigned long *end_pfn) 2971 { 2972 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 2973 nid, *start_pfn, *end_pfn); 2974 2975 /* Return if boundary information has not been provided */ 2976 if (node_boundary_end_pfn[nid] == 0) 2977 return; 2978 2979 /* Check the boundaries and update if necessary */ 2980 if (node_boundary_start_pfn[nid] < *start_pfn) 2981 *start_pfn = node_boundary_start_pfn[nid]; 2982 if (node_boundary_end_pfn[nid] > *end_pfn) 2983 *end_pfn = node_boundary_end_pfn[nid]; 2984 } 2985 #else 2986 void __init push_node_boundaries(unsigned int nid, 2987 unsigned long start_pfn, unsigned long end_pfn) {} 2988 2989 static void __meminit account_node_boundary(unsigned int nid, 2990 unsigned long *start_pfn, unsigned long *end_pfn) {} 2991 #endif 2992 2993 2994 /** 2995 * get_pfn_range_for_nid - Return the start and end page frames for a node 2996 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 2997 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 2998 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 2999 * 3000 * It returns the start and end page frame of a node based on information 3001 * provided by an arch calling add_active_range(). If called for a node 3002 * with no available memory, a warning is printed and the start and end 3003 * PFNs will be 0. 3004 */ 3005 void __meminit get_pfn_range_for_nid(unsigned int nid, 3006 unsigned long *start_pfn, unsigned long *end_pfn) 3007 { 3008 int i; 3009 *start_pfn = -1UL; 3010 *end_pfn = 0; 3011 3012 for_each_active_range_index_in_nid(i, nid) { 3013 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 3014 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3015 } 3016 3017 if (*start_pfn == -1UL) 3018 *start_pfn = 0; 3019 3020 /* Push the node boundaries out if requested */ 3021 account_node_boundary(nid, start_pfn, end_pfn); 3022 } 3023 3024 /* 3025 * This finds a zone that can be used for ZONE_MOVABLE pages. The 3026 * assumption is made that zones within a node are ordered in monotonic 3027 * increasing memory addresses so that the "highest" populated zone is used 3028 */ 3029 void __init find_usable_zone_for_movable(void) 3030 { 3031 int zone_index; 3032 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3033 if (zone_index == ZONE_MOVABLE) 3034 continue; 3035 3036 if (arch_zone_highest_possible_pfn[zone_index] > 3037 arch_zone_lowest_possible_pfn[zone_index]) 3038 break; 3039 } 3040 3041 VM_BUG_ON(zone_index == -1); 3042 movable_zone = zone_index; 3043 } 3044 3045 /* 3046 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3047 * because it is sized independant of architecture. Unlike the other zones, 3048 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3049 * in each node depending on the size of each node and how evenly kernelcore 3050 * is distributed. This helper function adjusts the zone ranges 3051 * provided by the architecture for a given node by using the end of the 3052 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3053 * zones within a node are in order of monotonic increases memory addresses 3054 */ 3055 void __meminit adjust_zone_range_for_zone_movable(int nid, 3056 unsigned long zone_type, 3057 unsigned long node_start_pfn, 3058 unsigned long node_end_pfn, 3059 unsigned long *zone_start_pfn, 3060 unsigned long *zone_end_pfn) 3061 { 3062 /* Only adjust if ZONE_MOVABLE is on this node */ 3063 if (zone_movable_pfn[nid]) { 3064 /* Size ZONE_MOVABLE */ 3065 if (zone_type == ZONE_MOVABLE) { 3066 *zone_start_pfn = zone_movable_pfn[nid]; 3067 *zone_end_pfn = min(node_end_pfn, 3068 arch_zone_highest_possible_pfn[movable_zone]); 3069 3070 /* Adjust for ZONE_MOVABLE starting within this range */ 3071 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 3072 *zone_end_pfn > zone_movable_pfn[nid]) { 3073 *zone_end_pfn = zone_movable_pfn[nid]; 3074 3075 /* Check if this whole range is within ZONE_MOVABLE */ 3076 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 3077 *zone_start_pfn = *zone_end_pfn; 3078 } 3079 } 3080 3081 /* 3082 * Return the number of pages a zone spans in a node, including holes 3083 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 3084 */ 3085 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 3086 unsigned long zone_type, 3087 unsigned long *ignored) 3088 { 3089 unsigned long node_start_pfn, node_end_pfn; 3090 unsigned long zone_start_pfn, zone_end_pfn; 3091 3092 /* Get the start and end of the node and zone */ 3093 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3094 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 3095 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 3096 adjust_zone_range_for_zone_movable(nid, zone_type, 3097 node_start_pfn, node_end_pfn, 3098 &zone_start_pfn, &zone_end_pfn); 3099 3100 /* Check that this node has pages within the zone's required range */ 3101 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 3102 return 0; 3103 3104 /* Move the zone boundaries inside the node if necessary */ 3105 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 3106 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 3107 3108 /* Return the spanned pages */ 3109 return zone_end_pfn - zone_start_pfn; 3110 } 3111 3112 /* 3113 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3114 * then all holes in the requested range will be accounted for. 3115 */ 3116 unsigned long __meminit __absent_pages_in_range(int nid, 3117 unsigned long range_start_pfn, 3118 unsigned long range_end_pfn) 3119 { 3120 int i = 0; 3121 unsigned long prev_end_pfn = 0, hole_pages = 0; 3122 unsigned long start_pfn; 3123 3124 /* Find the end_pfn of the first active range of pfns in the node */ 3125 i = first_active_region_index_in_nid(nid); 3126 if (i == -1) 3127 return 0; 3128 3129 prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 3130 3131 /* Account for ranges before physical memory on this node */ 3132 if (early_node_map[i].start_pfn > range_start_pfn) 3133 hole_pages = prev_end_pfn - range_start_pfn; 3134 3135 /* Find all holes for the zone within the node */ 3136 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 3137 3138 /* No need to continue if prev_end_pfn is outside the zone */ 3139 if (prev_end_pfn >= range_end_pfn) 3140 break; 3141 3142 /* Make sure the end of the zone is not within the hole */ 3143 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 3144 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 3145 3146 /* Update the hole size cound and move on */ 3147 if (start_pfn > range_start_pfn) { 3148 BUG_ON(prev_end_pfn > start_pfn); 3149 hole_pages += start_pfn - prev_end_pfn; 3150 } 3151 prev_end_pfn = early_node_map[i].end_pfn; 3152 } 3153 3154 /* Account for ranges past physical memory on this node */ 3155 if (range_end_pfn > prev_end_pfn) 3156 hole_pages += range_end_pfn - 3157 max(range_start_pfn, prev_end_pfn); 3158 3159 return hole_pages; 3160 } 3161 3162 /** 3163 * absent_pages_in_range - Return number of page frames in holes within a range 3164 * @start_pfn: The start PFN to start searching for holes 3165 * @end_pfn: The end PFN to stop searching for holes 3166 * 3167 * It returns the number of pages frames in memory holes within a range. 3168 */ 3169 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 3170 unsigned long end_pfn) 3171 { 3172 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 3173 } 3174 3175 /* Return the number of page frames in holes in a zone on a node */ 3176 static unsigned long __meminit zone_absent_pages_in_node(int nid, 3177 unsigned long zone_type, 3178 unsigned long *ignored) 3179 { 3180 unsigned long node_start_pfn, node_end_pfn; 3181 unsigned long zone_start_pfn, zone_end_pfn; 3182 3183 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3184 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 3185 node_start_pfn); 3186 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 3187 node_end_pfn); 3188 3189 adjust_zone_range_for_zone_movable(nid, zone_type, 3190 node_start_pfn, node_end_pfn, 3191 &zone_start_pfn, &zone_end_pfn); 3192 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 3193 } 3194 3195 #else 3196 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 3197 unsigned long zone_type, 3198 unsigned long *zones_size) 3199 { 3200 return zones_size[zone_type]; 3201 } 3202 3203 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 3204 unsigned long zone_type, 3205 unsigned long *zholes_size) 3206 { 3207 if (!zholes_size) 3208 return 0; 3209 3210 return zholes_size[zone_type]; 3211 } 3212 3213 #endif 3214 3215 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 3216 unsigned long *zones_size, unsigned long *zholes_size) 3217 { 3218 unsigned long realtotalpages, totalpages = 0; 3219 enum zone_type i; 3220 3221 for (i = 0; i < MAX_NR_ZONES; i++) 3222 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 3223 zones_size); 3224 pgdat->node_spanned_pages = totalpages; 3225 3226 realtotalpages = totalpages; 3227 for (i = 0; i < MAX_NR_ZONES; i++) 3228 realtotalpages -= 3229 zone_absent_pages_in_node(pgdat->node_id, i, 3230 zholes_size); 3231 pgdat->node_present_pages = realtotalpages; 3232 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 3233 realtotalpages); 3234 } 3235 3236 #ifndef CONFIG_SPARSEMEM 3237 /* 3238 * Calculate the size of the zone->blockflags rounded to an unsigned long 3239 * Start by making sure zonesize is a multiple of pageblock_order by rounding 3240 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 3241 * round what is now in bits to nearest long in bits, then return it in 3242 * bytes. 3243 */ 3244 static unsigned long __init usemap_size(unsigned long zonesize) 3245 { 3246 unsigned long usemapsize; 3247 3248 usemapsize = roundup(zonesize, pageblock_nr_pages); 3249 usemapsize = usemapsize >> pageblock_order; 3250 usemapsize *= NR_PAGEBLOCK_BITS; 3251 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 3252 3253 return usemapsize / 8; 3254 } 3255 3256 static void __init setup_usemap(struct pglist_data *pgdat, 3257 struct zone *zone, unsigned long zonesize) 3258 { 3259 unsigned long usemapsize = usemap_size(zonesize); 3260 zone->pageblock_flags = NULL; 3261 if (usemapsize) { 3262 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3263 memset(zone->pageblock_flags, 0, usemapsize); 3264 } 3265 } 3266 #else 3267 static void inline setup_usemap(struct pglist_data *pgdat, 3268 struct zone *zone, unsigned long zonesize) {} 3269 #endif /* CONFIG_SPARSEMEM */ 3270 3271 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 3272 3273 /* Return a sensible default order for the pageblock size. */ 3274 static inline int pageblock_default_order(void) 3275 { 3276 if (HPAGE_SHIFT > PAGE_SHIFT) 3277 return HUGETLB_PAGE_ORDER; 3278 3279 return MAX_ORDER-1; 3280 } 3281 3282 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 3283 static inline void __init set_pageblock_order(unsigned int order) 3284 { 3285 /* Check that pageblock_nr_pages has not already been setup */ 3286 if (pageblock_order) 3287 return; 3288 3289 /* 3290 * Assume the largest contiguous order of interest is a huge page. 3291 * This value may be variable depending on boot parameters on IA64 3292 */ 3293 pageblock_order = order; 3294 } 3295 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 3296 3297 /* 3298 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 3299 * and pageblock_default_order() are unused as pageblock_order is set 3300 * at compile-time. See include/linux/pageblock-flags.h for the values of 3301 * pageblock_order based on the kernel config 3302 */ 3303 static inline int pageblock_default_order(unsigned int order) 3304 { 3305 return MAX_ORDER-1; 3306 } 3307 #define set_pageblock_order(x) do {} while (0) 3308 3309 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 3310 3311 /* 3312 * Set up the zone data structures: 3313 * - mark all pages reserved 3314 * - mark all memory queues empty 3315 * - clear the memory bitmaps 3316 */ 3317 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 3318 unsigned long *zones_size, unsigned long *zholes_size) 3319 { 3320 enum zone_type j; 3321 int nid = pgdat->node_id; 3322 unsigned long zone_start_pfn = pgdat->node_start_pfn; 3323 int ret; 3324 3325 pgdat_resize_init(pgdat); 3326 pgdat->nr_zones = 0; 3327 init_waitqueue_head(&pgdat->kswapd_wait); 3328 pgdat->kswapd_max_order = 0; 3329 3330 for (j = 0; j < MAX_NR_ZONES; j++) { 3331 struct zone *zone = pgdat->node_zones + j; 3332 unsigned long size, realsize, memmap_pages; 3333 3334 size = zone_spanned_pages_in_node(nid, j, zones_size); 3335 realsize = size - zone_absent_pages_in_node(nid, j, 3336 zholes_size); 3337 3338 /* 3339 * Adjust realsize so that it accounts for how much memory 3340 * is used by this zone for memmap. This affects the watermark 3341 * and per-cpu initialisations 3342 */ 3343 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; 3344 if (realsize >= memmap_pages) { 3345 realsize -= memmap_pages; 3346 printk(KERN_DEBUG 3347 " %s zone: %lu pages used for memmap\n", 3348 zone_names[j], memmap_pages); 3349 } else 3350 printk(KERN_WARNING 3351 " %s zone: %lu pages exceeds realsize %lu\n", 3352 zone_names[j], memmap_pages, realsize); 3353 3354 /* Account for reserved pages */ 3355 if (j == 0 && realsize > dma_reserve) { 3356 realsize -= dma_reserve; 3357 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3358 zone_names[0], dma_reserve); 3359 } 3360 3361 if (!is_highmem_idx(j)) 3362 nr_kernel_pages += realsize; 3363 nr_all_pages += realsize; 3364 3365 zone->spanned_pages = size; 3366 zone->present_pages = realsize; 3367 #ifdef CONFIG_NUMA 3368 zone->node = nid; 3369 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 3370 / 100; 3371 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 3372 #endif 3373 zone->name = zone_names[j]; 3374 spin_lock_init(&zone->lock); 3375 spin_lock_init(&zone->lru_lock); 3376 zone_seqlock_init(zone); 3377 zone->zone_pgdat = pgdat; 3378 3379 zone->prev_priority = DEF_PRIORITY; 3380 3381 zone_pcp_init(zone); 3382 INIT_LIST_HEAD(&zone->active_list); 3383 INIT_LIST_HEAD(&zone->inactive_list); 3384 zone->nr_scan_active = 0; 3385 zone->nr_scan_inactive = 0; 3386 zap_zone_vm_stats(zone); 3387 zone->flags = 0; 3388 if (!size) 3389 continue; 3390 3391 set_pageblock_order(pageblock_default_order()); 3392 setup_usemap(pgdat, zone, size); 3393 ret = init_currently_empty_zone(zone, zone_start_pfn, 3394 size, MEMMAP_EARLY); 3395 BUG_ON(ret); 3396 zone_start_pfn += size; 3397 } 3398 } 3399 3400 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 3401 { 3402 /* Skip empty nodes */ 3403 if (!pgdat->node_spanned_pages) 3404 return; 3405 3406 #ifdef CONFIG_FLAT_NODE_MEM_MAP 3407 /* ia64 gets its own node_mem_map, before this, without bootmem */ 3408 if (!pgdat->node_mem_map) { 3409 unsigned long size, start, end; 3410 struct page *map; 3411 3412 /* 3413 * The zone's endpoints aren't required to be MAX_ORDER 3414 * aligned but the node_mem_map endpoints must be in order 3415 * for the buddy allocator to function correctly. 3416 */ 3417 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 3418 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 3419 end = ALIGN(end, MAX_ORDER_NR_PAGES); 3420 size = (end - start) * sizeof(struct page); 3421 map = alloc_remap(pgdat->node_id, size); 3422 if (!map) 3423 map = alloc_bootmem_node(pgdat, size); 3424 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 3425 } 3426 #ifndef CONFIG_NEED_MULTIPLE_NODES 3427 /* 3428 * With no DISCONTIG, the global mem_map is just set as node 0's 3429 */ 3430 if (pgdat == NODE_DATA(0)) { 3431 mem_map = NODE_DATA(0)->node_mem_map; 3432 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3433 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 3434 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 3435 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3436 } 3437 #endif 3438 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3439 } 3440 3441 void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3442 unsigned long *zones_size, unsigned long node_start_pfn, 3443 unsigned long *zholes_size) 3444 { 3445 pgdat->node_id = nid; 3446 pgdat->node_start_pfn = node_start_pfn; 3447 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3448 3449 alloc_node_mem_map(pgdat); 3450 3451 free_area_init_core(pgdat, zones_size, zholes_size); 3452 } 3453 3454 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3455 3456 #if MAX_NUMNODES > 1 3457 /* 3458 * Figure out the number of possible node ids. 3459 */ 3460 static void __init setup_nr_node_ids(void) 3461 { 3462 unsigned int node; 3463 unsigned int highest = 0; 3464 3465 for_each_node_mask(node, node_possible_map) 3466 highest = node; 3467 nr_node_ids = highest + 1; 3468 } 3469 #else 3470 static inline void setup_nr_node_ids(void) 3471 { 3472 } 3473 #endif 3474 3475 /** 3476 * add_active_range - Register a range of PFNs backed by physical memory 3477 * @nid: The node ID the range resides on 3478 * @start_pfn: The start PFN of the available physical memory 3479 * @end_pfn: The end PFN of the available physical memory 3480 * 3481 * These ranges are stored in an early_node_map[] and later used by 3482 * free_area_init_nodes() to calculate zone sizes and holes. If the 3483 * range spans a memory hole, it is up to the architecture to ensure 3484 * the memory is not freed by the bootmem allocator. If possible 3485 * the range being registered will be merged with existing ranges. 3486 */ 3487 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 3488 unsigned long end_pfn) 3489 { 3490 int i; 3491 3492 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 3493 "%d entries of %d used\n", 3494 nid, start_pfn, end_pfn, 3495 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3496 3497 /* Merge with existing active regions if possible */ 3498 for (i = 0; i < nr_nodemap_entries; i++) { 3499 if (early_node_map[i].nid != nid) 3500 continue; 3501 3502 /* Skip if an existing region covers this new one */ 3503 if (start_pfn >= early_node_map[i].start_pfn && 3504 end_pfn <= early_node_map[i].end_pfn) 3505 return; 3506 3507 /* Merge forward if suitable */ 3508 if (start_pfn <= early_node_map[i].end_pfn && 3509 end_pfn > early_node_map[i].end_pfn) { 3510 early_node_map[i].end_pfn = end_pfn; 3511 return; 3512 } 3513 3514 /* Merge backward if suitable */ 3515 if (start_pfn < early_node_map[i].end_pfn && 3516 end_pfn >= early_node_map[i].start_pfn) { 3517 early_node_map[i].start_pfn = start_pfn; 3518 return; 3519 } 3520 } 3521 3522 /* Check that early_node_map is large enough */ 3523 if (i >= MAX_ACTIVE_REGIONS) { 3524 printk(KERN_CRIT "More than %d memory regions, truncating\n", 3525 MAX_ACTIVE_REGIONS); 3526 return; 3527 } 3528 3529 early_node_map[i].nid = nid; 3530 early_node_map[i].start_pfn = start_pfn; 3531 early_node_map[i].end_pfn = end_pfn; 3532 nr_nodemap_entries = i + 1; 3533 } 3534 3535 /** 3536 * shrink_active_range - Shrink an existing registered range of PFNs 3537 * @nid: The node id the range is on that should be shrunk 3538 * @old_end_pfn: The old end PFN of the range 3539 * @new_end_pfn: The new PFN of the range 3540 * 3541 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. 3542 * The map is kept at the end physical page range that has already been 3543 * registered with add_active_range(). This function allows an arch to shrink 3544 * an existing registered range. 3545 */ 3546 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 3547 unsigned long new_end_pfn) 3548 { 3549 int i; 3550 3551 /* Find the old active region end and shrink */ 3552 for_each_active_range_index_in_nid(i, nid) 3553 if (early_node_map[i].end_pfn == old_end_pfn) { 3554 early_node_map[i].end_pfn = new_end_pfn; 3555 break; 3556 } 3557 } 3558 3559 /** 3560 * remove_all_active_ranges - Remove all currently registered regions 3561 * 3562 * During discovery, it may be found that a table like SRAT is invalid 3563 * and an alternative discovery method must be used. This function removes 3564 * all currently registered regions. 3565 */ 3566 void __init remove_all_active_ranges(void) 3567 { 3568 memset(early_node_map, 0, sizeof(early_node_map)); 3569 nr_nodemap_entries = 0; 3570 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 3571 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 3572 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 3573 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 3574 } 3575 3576 /* Compare two active node_active_regions */ 3577 static int __init cmp_node_active_region(const void *a, const void *b) 3578 { 3579 struct node_active_region *arange = (struct node_active_region *)a; 3580 struct node_active_region *brange = (struct node_active_region *)b; 3581 3582 /* Done this way to avoid overflows */ 3583 if (arange->start_pfn > brange->start_pfn) 3584 return 1; 3585 if (arange->start_pfn < brange->start_pfn) 3586 return -1; 3587 3588 return 0; 3589 } 3590 3591 /* sort the node_map by start_pfn */ 3592 static void __init sort_node_map(void) 3593 { 3594 sort(early_node_map, (size_t)nr_nodemap_entries, 3595 sizeof(struct node_active_region), 3596 cmp_node_active_region, NULL); 3597 } 3598 3599 /* Find the lowest pfn for a node */ 3600 unsigned long __init find_min_pfn_for_node(unsigned long nid) 3601 { 3602 int i; 3603 unsigned long min_pfn = ULONG_MAX; 3604 3605 /* Assuming a sorted map, the first range found has the starting pfn */ 3606 for_each_active_range_index_in_nid(i, nid) 3607 min_pfn = min(min_pfn, early_node_map[i].start_pfn); 3608 3609 if (min_pfn == ULONG_MAX) { 3610 printk(KERN_WARNING 3611 "Could not find start_pfn for node %lu\n", nid); 3612 return 0; 3613 } 3614 3615 return min_pfn; 3616 } 3617 3618 /** 3619 * find_min_pfn_with_active_regions - Find the minimum PFN registered 3620 * 3621 * It returns the minimum PFN based on information provided via 3622 * add_active_range(). 3623 */ 3624 unsigned long __init find_min_pfn_with_active_regions(void) 3625 { 3626 return find_min_pfn_for_node(MAX_NUMNODES); 3627 } 3628 3629 /** 3630 * find_max_pfn_with_active_regions - Find the maximum PFN registered 3631 * 3632 * It returns the maximum PFN based on information provided via 3633 * add_active_range(). 3634 */ 3635 unsigned long __init find_max_pfn_with_active_regions(void) 3636 { 3637 int i; 3638 unsigned long max_pfn = 0; 3639 3640 for (i = 0; i < nr_nodemap_entries; i++) 3641 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 3642 3643 return max_pfn; 3644 } 3645 3646 /* 3647 * early_calculate_totalpages() 3648 * Sum pages in active regions for movable zone. 3649 * Populate N_HIGH_MEMORY for calculating usable_nodes. 3650 */ 3651 static unsigned long __init early_calculate_totalpages(void) 3652 { 3653 int i; 3654 unsigned long totalpages = 0; 3655 3656 for (i = 0; i < nr_nodemap_entries; i++) { 3657 unsigned long pages = early_node_map[i].end_pfn - 3658 early_node_map[i].start_pfn; 3659 totalpages += pages; 3660 if (pages) 3661 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); 3662 } 3663 return totalpages; 3664 } 3665 3666 /* 3667 * Find the PFN the Movable zone begins in each node. Kernel memory 3668 * is spread evenly between nodes as long as the nodes have enough 3669 * memory. When they don't, some nodes will have more kernelcore than 3670 * others 3671 */ 3672 void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3673 { 3674 int i, nid; 3675 unsigned long usable_startpfn; 3676 unsigned long kernelcore_node, kernelcore_remaining; 3677 unsigned long totalpages = early_calculate_totalpages(); 3678 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 3679 3680 /* 3681 * If movablecore was specified, calculate what size of 3682 * kernelcore that corresponds so that memory usable for 3683 * any allocation type is evenly spread. If both kernelcore 3684 * and movablecore are specified, then the value of kernelcore 3685 * will be used for required_kernelcore if it's greater than 3686 * what movablecore would have allowed. 3687 */ 3688 if (required_movablecore) { 3689 unsigned long corepages; 3690 3691 /* 3692 * Round-up so that ZONE_MOVABLE is at least as large as what 3693 * was requested by the user 3694 */ 3695 required_movablecore = 3696 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 3697 corepages = totalpages - required_movablecore; 3698 3699 required_kernelcore = max(required_kernelcore, corepages); 3700 } 3701 3702 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 3703 if (!required_kernelcore) 3704 return; 3705 3706 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 3707 find_usable_zone_for_movable(); 3708 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 3709 3710 restart: 3711 /* Spread kernelcore memory as evenly as possible throughout nodes */ 3712 kernelcore_node = required_kernelcore / usable_nodes; 3713 for_each_node_state(nid, N_HIGH_MEMORY) { 3714 /* 3715 * Recalculate kernelcore_node if the division per node 3716 * now exceeds what is necessary to satisfy the requested 3717 * amount of memory for the kernel 3718 */ 3719 if (required_kernelcore < kernelcore_node) 3720 kernelcore_node = required_kernelcore / usable_nodes; 3721 3722 /* 3723 * As the map is walked, we track how much memory is usable 3724 * by the kernel using kernelcore_remaining. When it is 3725 * 0, the rest of the node is usable by ZONE_MOVABLE 3726 */ 3727 kernelcore_remaining = kernelcore_node; 3728 3729 /* Go through each range of PFNs within this node */ 3730 for_each_active_range_index_in_nid(i, nid) { 3731 unsigned long start_pfn, end_pfn; 3732 unsigned long size_pages; 3733 3734 start_pfn = max(early_node_map[i].start_pfn, 3735 zone_movable_pfn[nid]); 3736 end_pfn = early_node_map[i].end_pfn; 3737 if (start_pfn >= end_pfn) 3738 continue; 3739 3740 /* Account for what is only usable for kernelcore */ 3741 if (start_pfn < usable_startpfn) { 3742 unsigned long kernel_pages; 3743 kernel_pages = min(end_pfn, usable_startpfn) 3744 - start_pfn; 3745 3746 kernelcore_remaining -= min(kernel_pages, 3747 kernelcore_remaining); 3748 required_kernelcore -= min(kernel_pages, 3749 required_kernelcore); 3750 3751 /* Continue if range is now fully accounted */ 3752 if (end_pfn <= usable_startpfn) { 3753 3754 /* 3755 * Push zone_movable_pfn to the end so 3756 * that if we have to rebalance 3757 * kernelcore across nodes, we will 3758 * not double account here 3759 */ 3760 zone_movable_pfn[nid] = end_pfn; 3761 continue; 3762 } 3763 start_pfn = usable_startpfn; 3764 } 3765 3766 /* 3767 * The usable PFN range for ZONE_MOVABLE is from 3768 * start_pfn->end_pfn. Calculate size_pages as the 3769 * number of pages used as kernelcore 3770 */ 3771 size_pages = end_pfn - start_pfn; 3772 if (size_pages > kernelcore_remaining) 3773 size_pages = kernelcore_remaining; 3774 zone_movable_pfn[nid] = start_pfn + size_pages; 3775 3776 /* 3777 * Some kernelcore has been met, update counts and 3778 * break if the kernelcore for this node has been 3779 * satisified 3780 */ 3781 required_kernelcore -= min(required_kernelcore, 3782 size_pages); 3783 kernelcore_remaining -= size_pages; 3784 if (!kernelcore_remaining) 3785 break; 3786 } 3787 } 3788 3789 /* 3790 * If there is still required_kernelcore, we do another pass with one 3791 * less node in the count. This will push zone_movable_pfn[nid] further 3792 * along on the nodes that still have memory until kernelcore is 3793 * satisified 3794 */ 3795 usable_nodes--; 3796 if (usable_nodes && required_kernelcore > usable_nodes) 3797 goto restart; 3798 3799 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 3800 for (nid = 0; nid < MAX_NUMNODES; nid++) 3801 zone_movable_pfn[nid] = 3802 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 3803 } 3804 3805 /* Any regular memory on that node ? */ 3806 static void check_for_regular_memory(pg_data_t *pgdat) 3807 { 3808 #ifdef CONFIG_HIGHMEM 3809 enum zone_type zone_type; 3810 3811 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 3812 struct zone *zone = &pgdat->node_zones[zone_type]; 3813 if (zone->present_pages) 3814 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 3815 } 3816 #endif 3817 } 3818 3819 /** 3820 * free_area_init_nodes - Initialise all pg_data_t and zone data 3821 * @max_zone_pfn: an array of max PFNs for each zone 3822 * 3823 * This will call free_area_init_node() for each active node in the system. 3824 * Using the page ranges provided by add_active_range(), the size of each 3825 * zone in each node and their holes is calculated. If the maximum PFN 3826 * between two adjacent zones match, it is assumed that the zone is empty. 3827 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 3828 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 3829 * starts where the previous one ended. For example, ZONE_DMA32 starts 3830 * at arch_max_dma_pfn. 3831 */ 3832 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 3833 { 3834 unsigned long nid; 3835 enum zone_type i; 3836 3837 /* Sort early_node_map as initialisation assumes it is sorted */ 3838 sort_node_map(); 3839 3840 /* Record where the zone boundaries are */ 3841 memset(arch_zone_lowest_possible_pfn, 0, 3842 sizeof(arch_zone_lowest_possible_pfn)); 3843 memset(arch_zone_highest_possible_pfn, 0, 3844 sizeof(arch_zone_highest_possible_pfn)); 3845 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 3846 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 3847 for (i = 1; i < MAX_NR_ZONES; i++) { 3848 if (i == ZONE_MOVABLE) 3849 continue; 3850 arch_zone_lowest_possible_pfn[i] = 3851 arch_zone_highest_possible_pfn[i-1]; 3852 arch_zone_highest_possible_pfn[i] = 3853 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 3854 } 3855 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 3856 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 3857 3858 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 3859 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 3860 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 3861 3862 /* Print out the zone ranges */ 3863 printk("Zone PFN ranges:\n"); 3864 for (i = 0; i < MAX_NR_ZONES; i++) { 3865 if (i == ZONE_MOVABLE) 3866 continue; 3867 printk(" %-8s %8lu -> %8lu\n", 3868 zone_names[i], 3869 arch_zone_lowest_possible_pfn[i], 3870 arch_zone_highest_possible_pfn[i]); 3871 } 3872 3873 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 3874 printk("Movable zone start PFN for each node\n"); 3875 for (i = 0; i < MAX_NUMNODES; i++) { 3876 if (zone_movable_pfn[i]) 3877 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 3878 } 3879 3880 /* Print out the early_node_map[] */ 3881 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 3882 for (i = 0; i < nr_nodemap_entries; i++) 3883 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 3884 early_node_map[i].start_pfn, 3885 early_node_map[i].end_pfn); 3886 3887 /* Initialise every node */ 3888 setup_nr_node_ids(); 3889 for_each_online_node(nid) { 3890 pg_data_t *pgdat = NODE_DATA(nid); 3891 free_area_init_node(nid, pgdat, NULL, 3892 find_min_pfn_for_node(nid), NULL); 3893 3894 /* Any memory on that node */ 3895 if (pgdat->node_present_pages) 3896 node_set_state(nid, N_HIGH_MEMORY); 3897 check_for_regular_memory(pgdat); 3898 } 3899 } 3900 3901 static int __init cmdline_parse_core(char *p, unsigned long *core) 3902 { 3903 unsigned long long coremem; 3904 if (!p) 3905 return -EINVAL; 3906 3907 coremem = memparse(p, &p); 3908 *core = coremem >> PAGE_SHIFT; 3909 3910 /* Paranoid check that UL is enough for the coremem value */ 3911 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 3912 3913 return 0; 3914 } 3915 3916 /* 3917 * kernelcore=size sets the amount of memory for use for allocations that 3918 * cannot be reclaimed or migrated. 3919 */ 3920 static int __init cmdline_parse_kernelcore(char *p) 3921 { 3922 return cmdline_parse_core(p, &required_kernelcore); 3923 } 3924 3925 /* 3926 * movablecore=size sets the amount of memory for use for allocations that 3927 * can be reclaimed or migrated. 3928 */ 3929 static int __init cmdline_parse_movablecore(char *p) 3930 { 3931 return cmdline_parse_core(p, &required_movablecore); 3932 } 3933 3934 early_param("kernelcore", cmdline_parse_kernelcore); 3935 early_param("movablecore", cmdline_parse_movablecore); 3936 3937 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3938 3939 /** 3940 * set_dma_reserve - set the specified number of pages reserved in the first zone 3941 * @new_dma_reserve: The number of pages to mark reserved 3942 * 3943 * The per-cpu batchsize and zone watermarks are determined by present_pages. 3944 * In the DMA zone, a significant percentage may be consumed by kernel image 3945 * and other unfreeable allocations which can skew the watermarks badly. This 3946 * function may optionally be used to account for unfreeable pages in the 3947 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 3948 * smaller per-cpu batchsize. 3949 */ 3950 void __init set_dma_reserve(unsigned long new_dma_reserve) 3951 { 3952 dma_reserve = new_dma_reserve; 3953 } 3954 3955 #ifndef CONFIG_NEED_MULTIPLE_NODES 3956 static bootmem_data_t contig_bootmem_data; 3957 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 3958 3959 EXPORT_SYMBOL(contig_page_data); 3960 #endif 3961 3962 void __init free_area_init(unsigned long *zones_size) 3963 { 3964 free_area_init_node(0, NODE_DATA(0), zones_size, 3965 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 3966 } 3967 3968 static int page_alloc_cpu_notify(struct notifier_block *self, 3969 unsigned long action, void *hcpu) 3970 { 3971 int cpu = (unsigned long)hcpu; 3972 3973 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3974 drain_pages(cpu); 3975 3976 /* 3977 * Spill the event counters of the dead processor 3978 * into the current processors event counters. 3979 * This artificially elevates the count of the current 3980 * processor. 3981 */ 3982 vm_events_fold_cpu(cpu); 3983 3984 /* 3985 * Zero the differential counters of the dead processor 3986 * so that the vm statistics are consistent. 3987 * 3988 * This is only okay since the processor is dead and cannot 3989 * race with what we are doing. 3990 */ 3991 refresh_cpu_vm_stats(cpu); 3992 } 3993 return NOTIFY_OK; 3994 } 3995 3996 void __init page_alloc_init(void) 3997 { 3998 hotcpu_notifier(page_alloc_cpu_notify, 0); 3999 } 4000 4001 /* 4002 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 4003 * or min_free_kbytes changes. 4004 */ 4005 static void calculate_totalreserve_pages(void) 4006 { 4007 struct pglist_data *pgdat; 4008 unsigned long reserve_pages = 0; 4009 enum zone_type i, j; 4010 4011 for_each_online_pgdat(pgdat) { 4012 for (i = 0; i < MAX_NR_ZONES; i++) { 4013 struct zone *zone = pgdat->node_zones + i; 4014 unsigned long max = 0; 4015 4016 /* Find valid and maximum lowmem_reserve in the zone */ 4017 for (j = i; j < MAX_NR_ZONES; j++) { 4018 if (zone->lowmem_reserve[j] > max) 4019 max = zone->lowmem_reserve[j]; 4020 } 4021 4022 /* we treat pages_high as reserved pages. */ 4023 max += zone->pages_high; 4024 4025 if (max > zone->present_pages) 4026 max = zone->present_pages; 4027 reserve_pages += max; 4028 } 4029 } 4030 totalreserve_pages = reserve_pages; 4031 } 4032 4033 /* 4034 * setup_per_zone_lowmem_reserve - called whenever 4035 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 4036 * has a correct pages reserved value, so an adequate number of 4037 * pages are left in the zone after a successful __alloc_pages(). 4038 */ 4039 static void setup_per_zone_lowmem_reserve(void) 4040 { 4041 struct pglist_data *pgdat; 4042 enum zone_type j, idx; 4043 4044 for_each_online_pgdat(pgdat) { 4045 for (j = 0; j < MAX_NR_ZONES; j++) { 4046 struct zone *zone = pgdat->node_zones + j; 4047 unsigned long present_pages = zone->present_pages; 4048 4049 zone->lowmem_reserve[j] = 0; 4050 4051 idx = j; 4052 while (idx) { 4053 struct zone *lower_zone; 4054 4055 idx--; 4056 4057 if (sysctl_lowmem_reserve_ratio[idx] < 1) 4058 sysctl_lowmem_reserve_ratio[idx] = 1; 4059 4060 lower_zone = pgdat->node_zones + idx; 4061 lower_zone->lowmem_reserve[j] = present_pages / 4062 sysctl_lowmem_reserve_ratio[idx]; 4063 present_pages += lower_zone->present_pages; 4064 } 4065 } 4066 } 4067 4068 /* update totalreserve_pages */ 4069 calculate_totalreserve_pages(); 4070 } 4071 4072 /** 4073 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4074 * 4075 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4076 * with respect to min_free_kbytes. 4077 */ 4078 void setup_per_zone_pages_min(void) 4079 { 4080 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4081 unsigned long lowmem_pages = 0; 4082 struct zone *zone; 4083 unsigned long flags; 4084 4085 /* Calculate total number of !ZONE_HIGHMEM pages */ 4086 for_each_zone(zone) { 4087 if (!is_highmem(zone)) 4088 lowmem_pages += zone->present_pages; 4089 } 4090 4091 for_each_zone(zone) { 4092 u64 tmp; 4093 4094 spin_lock_irqsave(&zone->lru_lock, flags); 4095 tmp = (u64)pages_min * zone->present_pages; 4096 do_div(tmp, lowmem_pages); 4097 if (is_highmem(zone)) { 4098 /* 4099 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 4100 * need highmem pages, so cap pages_min to a small 4101 * value here. 4102 * 4103 * The (pages_high-pages_low) and (pages_low-pages_min) 4104 * deltas controls asynch page reclaim, and so should 4105 * not be capped for highmem. 4106 */ 4107 int min_pages; 4108 4109 min_pages = zone->present_pages / 1024; 4110 if (min_pages < SWAP_CLUSTER_MAX) 4111 min_pages = SWAP_CLUSTER_MAX; 4112 if (min_pages > 128) 4113 min_pages = 128; 4114 zone->pages_min = min_pages; 4115 } else { 4116 /* 4117 * If it's a lowmem zone, reserve a number of pages 4118 * proportionate to the zone's size. 4119 */ 4120 zone->pages_min = tmp; 4121 } 4122 4123 zone->pages_low = zone->pages_min + (tmp >> 2); 4124 zone->pages_high = zone->pages_min + (tmp >> 1); 4125 setup_zone_migrate_reserve(zone); 4126 spin_unlock_irqrestore(&zone->lru_lock, flags); 4127 } 4128 4129 /* update totalreserve_pages */ 4130 calculate_totalreserve_pages(); 4131 } 4132 4133 /* 4134 * Initialise min_free_kbytes. 4135 * 4136 * For small machines we want it small (128k min). For large machines 4137 * we want it large (64MB max). But it is not linear, because network 4138 * bandwidth does not increase linearly with machine size. We use 4139 * 4140 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 4141 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 4142 * 4143 * which yields 4144 * 4145 * 16MB: 512k 4146 * 32MB: 724k 4147 * 64MB: 1024k 4148 * 128MB: 1448k 4149 * 256MB: 2048k 4150 * 512MB: 2896k 4151 * 1024MB: 4096k 4152 * 2048MB: 5792k 4153 * 4096MB: 8192k 4154 * 8192MB: 11584k 4155 * 16384MB: 16384k 4156 */ 4157 static int __init init_per_zone_pages_min(void) 4158 { 4159 unsigned long lowmem_kbytes; 4160 4161 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 4162 4163 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 4164 if (min_free_kbytes < 128) 4165 min_free_kbytes = 128; 4166 if (min_free_kbytes > 65536) 4167 min_free_kbytes = 65536; 4168 setup_per_zone_pages_min(); 4169 setup_per_zone_lowmem_reserve(); 4170 return 0; 4171 } 4172 module_init(init_per_zone_pages_min) 4173 4174 /* 4175 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4176 * that we can call two helper functions whenever min_free_kbytes 4177 * changes. 4178 */ 4179 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4180 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4181 { 4182 proc_dointvec(table, write, file, buffer, length, ppos); 4183 if (write) 4184 setup_per_zone_pages_min(); 4185 return 0; 4186 } 4187 4188 #ifdef CONFIG_NUMA 4189 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4190 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4191 { 4192 struct zone *zone; 4193 int rc; 4194 4195 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4196 if (rc) 4197 return rc; 4198 4199 for_each_zone(zone) 4200 zone->min_unmapped_pages = (zone->present_pages * 4201 sysctl_min_unmapped_ratio) / 100; 4202 return 0; 4203 } 4204 4205 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4206 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4207 { 4208 struct zone *zone; 4209 int rc; 4210 4211 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4212 if (rc) 4213 return rc; 4214 4215 for_each_zone(zone) 4216 zone->min_slab_pages = (zone->present_pages * 4217 sysctl_min_slab_ratio) / 100; 4218 return 0; 4219 } 4220 #endif 4221 4222 /* 4223 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 4224 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 4225 * whenever sysctl_lowmem_reserve_ratio changes. 4226 * 4227 * The reserve ratio obviously has absolutely no relation with the 4228 * pages_min watermarks. The lowmem reserve ratio can only make sense 4229 * if in function of the boot time zone sizes. 4230 */ 4231 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4232 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4233 { 4234 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4235 setup_per_zone_lowmem_reserve(); 4236 return 0; 4237 } 4238 4239 /* 4240 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 4241 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 4242 * can have before it gets flushed back to buddy allocator. 4243 */ 4244 4245 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4246 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4247 { 4248 struct zone *zone; 4249 unsigned int cpu; 4250 int ret; 4251 4252 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4253 if (!write || (ret == -EINVAL)) 4254 return ret; 4255 for_each_zone(zone) { 4256 for_each_online_cpu(cpu) { 4257 unsigned long high; 4258 high = zone->present_pages / percpu_pagelist_fraction; 4259 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4260 } 4261 } 4262 return 0; 4263 } 4264 4265 int hashdist = HASHDIST_DEFAULT; 4266 4267 #ifdef CONFIG_NUMA 4268 static int __init set_hashdist(char *str) 4269 { 4270 if (!str) 4271 return 0; 4272 hashdist = simple_strtoul(str, &str, 0); 4273 return 1; 4274 } 4275 __setup("hashdist=", set_hashdist); 4276 #endif 4277 4278 /* 4279 * allocate a large system hash table from bootmem 4280 * - it is assumed that the hash table must contain an exact power-of-2 4281 * quantity of entries 4282 * - limit is the number of hash buckets, not the total allocation size 4283 */ 4284 void *__init alloc_large_system_hash(const char *tablename, 4285 unsigned long bucketsize, 4286 unsigned long numentries, 4287 int scale, 4288 int flags, 4289 unsigned int *_hash_shift, 4290 unsigned int *_hash_mask, 4291 unsigned long limit) 4292 { 4293 unsigned long long max = limit; 4294 unsigned long log2qty, size; 4295 void *table = NULL; 4296 4297 /* allow the kernel cmdline to have a say */ 4298 if (!numentries) { 4299 /* round applicable memory size up to nearest megabyte */ 4300 numentries = nr_kernel_pages; 4301 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 4302 numentries >>= 20 - PAGE_SHIFT; 4303 numentries <<= 20 - PAGE_SHIFT; 4304 4305 /* limit to 1 bucket per 2^scale bytes of low memory */ 4306 if (scale > PAGE_SHIFT) 4307 numentries >>= (scale - PAGE_SHIFT); 4308 else 4309 numentries <<= (PAGE_SHIFT - scale); 4310 4311 /* Make sure we've got at least a 0-order allocation.. */ 4312 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4313 numentries = PAGE_SIZE / bucketsize; 4314 } 4315 numentries = roundup_pow_of_two(numentries); 4316 4317 /* limit allocation size to 1/16 total memory by default */ 4318 if (max == 0) { 4319 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 4320 do_div(max, bucketsize); 4321 } 4322 4323 if (numentries > max) 4324 numentries = max; 4325 4326 log2qty = ilog2(numentries); 4327 4328 do { 4329 size = bucketsize << log2qty; 4330 if (flags & HASH_EARLY) 4331 table = alloc_bootmem(size); 4332 else if (hashdist) 4333 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4334 else { 4335 unsigned long order; 4336 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 4337 ; 4338 table = (void*) __get_free_pages(GFP_ATOMIC, order); 4339 /* 4340 * If bucketsize is not a power-of-two, we may free 4341 * some pages at the end of hash table. 4342 */ 4343 if (table) { 4344 unsigned long alloc_end = (unsigned long)table + 4345 (PAGE_SIZE << order); 4346 unsigned long used = (unsigned long)table + 4347 PAGE_ALIGN(size); 4348 split_page(virt_to_page(table), order); 4349 while (used < alloc_end) { 4350 free_page(used); 4351 used += PAGE_SIZE; 4352 } 4353 } 4354 } 4355 } while (!table && size > PAGE_SIZE && --log2qty); 4356 4357 if (!table) 4358 panic("Failed to allocate %s hash table\n", tablename); 4359 4360 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", 4361 tablename, 4362 (1U << log2qty), 4363 ilog2(size) - PAGE_SHIFT, 4364 size); 4365 4366 if (_hash_shift) 4367 *_hash_shift = log2qty; 4368 if (_hash_mask) 4369 *_hash_mask = (1 << log2qty) - 1; 4370 4371 return table; 4372 } 4373 4374 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 4375 struct page *pfn_to_page(unsigned long pfn) 4376 { 4377 return __pfn_to_page(pfn); 4378 } 4379 unsigned long page_to_pfn(struct page *page) 4380 { 4381 return __page_to_pfn(page); 4382 } 4383 EXPORT_SYMBOL(pfn_to_page); 4384 EXPORT_SYMBOL(page_to_pfn); 4385 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 4386 4387 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 4388 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4389 unsigned long pfn) 4390 { 4391 #ifdef CONFIG_SPARSEMEM 4392 return __pfn_to_section(pfn)->pageblock_flags; 4393 #else 4394 return zone->pageblock_flags; 4395 #endif /* CONFIG_SPARSEMEM */ 4396 } 4397 4398 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 4399 { 4400 #ifdef CONFIG_SPARSEMEM 4401 pfn &= (PAGES_PER_SECTION-1); 4402 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 4403 #else 4404 pfn = pfn - zone->zone_start_pfn; 4405 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 4406 #endif /* CONFIG_SPARSEMEM */ 4407 } 4408 4409 /** 4410 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 4411 * @page: The page within the block of interest 4412 * @start_bitidx: The first bit of interest to retrieve 4413 * @end_bitidx: The last bit of interest 4414 * returns pageblock_bits flags 4415 */ 4416 unsigned long get_pageblock_flags_group(struct page *page, 4417 int start_bitidx, int end_bitidx) 4418 { 4419 struct zone *zone; 4420 unsigned long *bitmap; 4421 unsigned long pfn, bitidx; 4422 unsigned long flags = 0; 4423 unsigned long value = 1; 4424 4425 zone = page_zone(page); 4426 pfn = page_to_pfn(page); 4427 bitmap = get_pageblock_bitmap(zone, pfn); 4428 bitidx = pfn_to_bitidx(zone, pfn); 4429 4430 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 4431 if (test_bit(bitidx + start_bitidx, bitmap)) 4432 flags |= value; 4433 4434 return flags; 4435 } 4436 4437 /** 4438 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 4439 * @page: The page within the block of interest 4440 * @start_bitidx: The first bit of interest 4441 * @end_bitidx: The last bit of interest 4442 * @flags: The flags to set 4443 */ 4444 void set_pageblock_flags_group(struct page *page, unsigned long flags, 4445 int start_bitidx, int end_bitidx) 4446 { 4447 struct zone *zone; 4448 unsigned long *bitmap; 4449 unsigned long pfn, bitidx; 4450 unsigned long value = 1; 4451 4452 zone = page_zone(page); 4453 pfn = page_to_pfn(page); 4454 bitmap = get_pageblock_bitmap(zone, pfn); 4455 bitidx = pfn_to_bitidx(zone, pfn); 4456 4457 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 4458 if (flags & value) 4459 __set_bit(bitidx + start_bitidx, bitmap); 4460 else 4461 __clear_bit(bitidx + start_bitidx, bitmap); 4462 } 4463 4464 /* 4465 * This is designed as sub function...plz see page_isolation.c also. 4466 * set/clear page block's type to be ISOLATE. 4467 * page allocater never alloc memory from ISOLATE block. 4468 */ 4469 4470 int set_migratetype_isolate(struct page *page) 4471 { 4472 struct zone *zone; 4473 unsigned long flags; 4474 int ret = -EBUSY; 4475 4476 zone = page_zone(page); 4477 spin_lock_irqsave(&zone->lock, flags); 4478 /* 4479 * In future, more migrate types will be able to be isolation target. 4480 */ 4481 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 4482 goto out; 4483 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 4484 move_freepages_block(zone, page, MIGRATE_ISOLATE); 4485 ret = 0; 4486 out: 4487 spin_unlock_irqrestore(&zone->lock, flags); 4488 if (!ret) 4489 drain_all_pages(); 4490 return ret; 4491 } 4492 4493 void unset_migratetype_isolate(struct page *page) 4494 { 4495 struct zone *zone; 4496 unsigned long flags; 4497 zone = page_zone(page); 4498 spin_lock_irqsave(&zone->lock, flags); 4499 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 4500 goto out; 4501 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4502 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4503 out: 4504 spin_unlock_irqrestore(&zone->lock, flags); 4505 } 4506 4507 #ifdef CONFIG_MEMORY_HOTREMOVE 4508 /* 4509 * All pages in the range must be isolated before calling this. 4510 */ 4511 void 4512 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 4513 { 4514 struct page *page; 4515 struct zone *zone; 4516 int order, i; 4517 unsigned long pfn; 4518 unsigned long flags; 4519 /* find the first valid pfn */ 4520 for (pfn = start_pfn; pfn < end_pfn; pfn++) 4521 if (pfn_valid(pfn)) 4522 break; 4523 if (pfn == end_pfn) 4524 return; 4525 zone = page_zone(pfn_to_page(pfn)); 4526 spin_lock_irqsave(&zone->lock, flags); 4527 pfn = start_pfn; 4528 while (pfn < end_pfn) { 4529 if (!pfn_valid(pfn)) { 4530 pfn++; 4531 continue; 4532 } 4533 page = pfn_to_page(pfn); 4534 BUG_ON(page_count(page)); 4535 BUG_ON(!PageBuddy(page)); 4536 order = page_order(page); 4537 #ifdef CONFIG_DEBUG_VM 4538 printk(KERN_INFO "remove from free list %lx %d %lx\n", 4539 pfn, 1 << order, end_pfn); 4540 #endif 4541 list_del(&page->lru); 4542 rmv_page_order(page); 4543 zone->free_area[order].nr_free--; 4544 __mod_zone_page_state(zone, NR_FREE_PAGES, 4545 - (1UL << order)); 4546 for (i = 0; i < (1 << order); i++) 4547 SetPageReserved((page+i)); 4548 pfn += (1 << order); 4549 } 4550 spin_unlock_irqrestore(&zone->lock, flags); 4551 } 4552 #endif 4553