/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/config.h>
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>

#include <asm/tlbflush.h>
#include "internal.h"

/*
 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
 * initializer cleaner
 */
nodemask_t node_online_map = { { [0] = 1UL } };
EXPORT_SYMBOL(node_online_map);
nodemask_t node_possible_map = NODE_MASK_ALL;
EXPORT_SYMBOL(node_possible_map);
struct pglist_data *pgdat_list;
unsigned long totalram_pages;
unsigned long totalhigh_pages;
long nr_swap_pages;

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };

EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);

/*
 * Used by page_zone() to look up the address of the struct zone whose
 * id is encoded in the upper bits of page->flags
 */
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;

unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;

/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
		return 1;
	if (page_to_pfn(page) < zone->zone_start_pfn)
		return 1;
#ifdef CONFIG_HOLES_IN_ZONE
	if (!pfn_valid(page_to_pfn(page)))
		return 1;
#endif
	if (zone != page_zone(page))
		return 1;
	return 0;
}

static void bad_page(const char *function, struct page *page)
{
	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
		function, current->comm, page);
	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
		page->mapping, page_mapcount(page), page_count(page));
	printk(KERN_EMERG "Backtrace:\n");
	dump_stack();
	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
	page->flags &= ~(1 << PG_private	|
			1 << PG_locked		|
			1 << PG_lru		|
			1 << PG_active		|
			1 << PG_dirty		|
			1 << PG_swapcache	|
			1 << PG_writeback);
	set_page_count(page, 0);
	reset_page_mapcount(page);
	page->mapping = NULL;
	tainted |= TAINT_BAD_PAGE;
}

#ifndef CONFIG_HUGETLB_PAGE
#define prep_compound_page(page, order) do { } while (0)
#define destroy_compound_page(page, order) do { } while (0)
#else
/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page".
 *
 * The remaining PAGE_SIZE pages are called "tail pages".
 *
 * All pages have PG_compound set.  All pages have their ->private pointing at
 * the head page (even the head page has this).
 *
 * The first tail page's ->mapping, if non-zero, holds the address of the
 * compound page's put_page() function.
 *
 * The order of the allocation is stored in the first tail page's ->index.
 * This is only for debug at present.  This usage means that zero-order pages
 * may not be compound.
 */
static void prep_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	page[1].mapping = NULL;
	page[1].index = order;
	for (i = 0; i < nr_pages; i++) {
		struct page *p = page + i;

		SetPageCompound(p);
		p->private = (unsigned long)page;
	}
}

static void destroy_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	if (!PageCompound(page))
		return;

	if (page[1].index != order)
		bad_page(__FUNCTION__, page);

	for (i = 0; i < nr_pages; i++) {
		struct page *p = page + i;

		if (!PageCompound(p))
			bad_page(__FUNCTION__, page);
		if (p->private != (unsigned long)page)
			bad_page(__FUNCTION__, page);
		ClearPageCompound(p);
	}
}
#endif		/* CONFIG_HUGETLB_PAGE */
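
/*
 * Illustrative sketch (not part of the original source): for a
 * hypothetical order-2 compound allocation the four struct pages
 * would be laid out as
 *
 *	page[0]  head page, PG_compound set, ->private == &page[0]
 *	page[1]  tail page, PG_compound set, ->private == &page[0],
 *	         ->index == 2 (the order), ->mapping optionally holding
 *	         a destructor supplied by the user of the page
 *	page[2]  tail page, PG_compound set, ->private == &page[0]
 *	page[3]  tail page, PG_compound set, ->private == &page[0]
 *
 * destroy_compound_page() walks the same four pages and verifies and
 * clears this state before the buddy allocator merges them again.
 */
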
/*
 * function for dealing with page's order in buddy system.
 * zone->lock is already acquired when we use these.
 * So, we don't need atomic page->flags operations here.
 */
static inline unsigned long page_order(struct page *page)
{
	return page->private;
}

static inline void set_page_order(struct page *page, int order)
{
	page->private = order;
	__SetPagePrivate(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPagePrivate(page);
	page->private = 0;
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	unsigned long buddy_idx = page_idx ^ (1 << order);

	return page + (buddy_idx - page_idx);
}

static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return (page_idx & ~(1 << order));
}

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is free &&
 * (b) the buddy is on the buddy system &&
 * (c) a page and its buddy have the same order.
 * For recording the page's order, we use page->private and PG_private.
 */
static inline int page_is_buddy(struct page *page, int order)
{
	if (PagePrivate(page)           &&
	    (page_order(page) == order) &&
	    !PageReserved(page)         &&
	     page_count(page) == 0)
		return 1;
	return 0;
}
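
/*
 * Worked example (illustrative only, assuming MAX_ORDER > 2): take the
 * order-1 block starting at relative index 8.
 *
 *	__page_find_buddy():     buddy_idx    = 8 ^ (1 << 1) = 10
 *	__find_combined_index(): combined_idx = 8 & ~(1 << 1) = 8
 *
 * The same holds starting from the buddy: 10 ^ 2 = 8 and 10 & ~2 = 8,
 * so either half resolves to the same order-2 parent at index 8.
 * page_is_buddy() is what then decides whether the block at index 10
 * may actually be merged with the one at index 8.
 */
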
/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length (1 << order) and marked with PG_Private.  The
 * page's order is recorded in the page->private field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */

static inline void __free_pages_bulk (struct page *page,
		struct zone *zone, unsigned int order)
{
	unsigned long page_idx;
	int order_size = 1 << order;

	if (unlikely(order))
		destroy_compound_page(page, order);

	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

	BUG_ON(page_idx & (order_size - 1));
	BUG_ON(bad_range(zone, page));

	zone->free_pages += order_size;
	while (order < MAX_ORDER-1) {
		unsigned long combined_idx;
		struct free_area *area;
		struct page *buddy;

		combined_idx = __find_combined_index(page_idx, order);
		buddy = __page_find_buddy(page, page_idx, order);

		if (bad_range(zone, buddy))
			break;
		if (!page_is_buddy(buddy, order))
			break;		/* Move the buddy up one level. */
		list_del(&buddy->lru);
		area = zone->free_area + order;
		area->nr_free--;
		rmv_page_order(buddy);
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);
	list_add(&page->lru, &zone->free_area[order].free_list);
	zone->free_area[order].nr_free++;
}

static inline void free_pages_check(const char *function, struct page *page)
{
	if (page_mapcount(page) ||
		page->mapping != NULL ||
		page_count(page) != 0 ||
		(page->flags & (
			1 << PG_lru	|
			1 << PG_private |
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_reclaim	|
			1 << PG_slab	|
			1 << PG_swapcache |
			1 << PG_writeback )))
		bad_page(function, page);
	if (PageDirty(page))
		ClearPageDirty(page);
}

/*
 * Frees a list of pages.
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free, or 0 for all on the list.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static int
free_pages_bulk(struct zone *zone, int count,
		struct list_head *list, unsigned int order)
{
	unsigned long flags;
	struct page *page = NULL;
	int ret = 0;

	spin_lock_irqsave(&zone->lock, flags);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;
	while (!list_empty(list) && count--) {
		page = list_entry(list->prev, struct page, lru);
		/* have to delete it as __free_pages_bulk list manipulates */
		list_del(&page->lru);
		__free_pages_bulk(page, zone, order);
		ret++;
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return ret;
}

void __free_pages_ok(struct page *page, unsigned int order)
{
	LIST_HEAD(list);
	int i;

	arch_free_page(page, order);

	mod_page_state(pgfree, 1 << order);

#ifndef CONFIG_MMU
	if (order > 0)
		for (i = 1 ; i < (1 << order) ; ++i)
			__put_page(page + i);
#endif

	for (i = 0 ; i < (1 << order) ; ++i)
		free_pages_check(__FUNCTION__, page + i);
	list_add(&page->lru, &list);
	kernel_map_pages(page, 1<<order, 0);
	free_pages_bulk(page_zone(page), 1, &list, order);
}
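
/*
 * Illustrative walk-through (not from the original source): freeing an
 * order-0 page at relative index 5 while index 4 is already free and
 * of order 0 proceeds as
 *
 *	page_is_buddy(index 4, order 0) -> merge, combined index 4, order 1
 *	page_is_buddy(index 6, order 1) -> say it is busy, stop
 *
 * so the final result is one order-1 block at index 4 placed on
 * zone->free_area[1].free_list, with free_area[0].nr_free decremented
 * for the buddy that was removed and free_area[1].nr_free incremented.
 */
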
/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing.  Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function.  This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */
static inline struct page *
expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		BUG_ON(bad_range(zone, &page[size]));
		list_add(&page[size].lru, &area->free_list);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
	return page;
}

void set_page_refs(struct page *page, int order)
{
#ifdef CONFIG_MMU
	set_page_count(page, 1);
#else
	int i;

	/*
	 * We need to reference all the pages for this order, otherwise if
	 * anyone accesses one of the pages with (get/put) it will be freed.
	 * - eg: access_process_vm()
	 */
	for (i = 0; i < (1 << order); i++)
		set_page_count(page + i, 1);
#endif /* CONFIG_MMU */
}

/*
 * This page is about to be returned from the page allocator
 */
static void prep_new_page(struct page *page, int order)
{
	if (page->mapping || page_mapcount(page) ||
	    (page->flags & (
			1 << PG_private	|
			1 << PG_locked	|
			1 << PG_lru	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_reclaim	|
			1 << PG_swapcache |
			1 << PG_writeback )))
		bad_page(__FUNCTION__, page);

	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
			1 << PG_referenced | 1 << PG_arch_1 |
			1 << PG_checked | 1 << PG_mappedtodisk);
	page->private = 0;
	set_page_refs(page, order);
	kernel_map_pages(page, 1 << order, 1);
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order)
{
	struct free_area * area;
	unsigned int current_order;
	struct page *page;

	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = zone->free_area + current_order;
		if (list_empty(&area->free_list))
			continue;

		page = list_entry(area->free_list.next, struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		zone->free_pages -= 1UL << order;
		return expand(zone, page, order, current_order, area);
	}

	return NULL;
}
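
/*
 * Illustrative example (not part of the original source): an order-0
 * request against a zone whose lowest populated free list is order 3.
 * __rmqueue() takes the order-3 block and expand() then peels off the
 * upper halves on the way down:
 *
 *	high=3 -> high=2: pages [4..7] go to free_area[2]
 *	high=2 -> high=1: pages [2..3] go to free_area[1]
 *	high=1 -> high=0: page  [1]    goes to free_area[0]
 *
 * leaving page [0] to be returned to the caller.  The low halves are
 * handed out first on subsequent allocations, which is the delivery
 * order the comment above expand() is talking about.
 */
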
/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list)
{
	unsigned long flags;
	int i;
	int allocated = 0;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	for (i = 0; i < count; ++i) {
		page = __rmqueue(zone, order);
		if (page == NULL)
			break;
		allocated++;
		list_add_tail(&page->lru, list);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
	return allocated;
}

#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
	struct zone *zone;
	int i;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset;

		pset = &zone->pageset[cpu];
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp;

			pcp = &pset->pcp[i];
			pcp->count -= free_pages_bulk(zone, pcp->count,
						&pcp->list, 0);
		}
	}
}
#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */

#ifdef CONFIG_PM

void mark_free_pages(struct zone *zone)
{
	unsigned long zone_pfn, flags;
	int order;
	struct list_head *curr;

	if (!zone->spanned_pages)
		return;

	spin_lock_irqsave(&zone->lock, flags);
	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));

	for (order = MAX_ORDER - 1; order >= 0; --order)
		list_for_each(curr, &zone->free_area[order].free_list) {
			unsigned long start_pfn, i;

			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));

			for (i=0; i < (1<<order); i++)
				SetPageNosaveFree(pfn_to_page(start_pfn+i));
		}
	spin_unlock_irqrestore(&zone->lock, flags);
}

/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 */
void drain_local_pages(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__drain_pages(smp_processor_id());
	local_irq_restore(flags);
}
#endif /* CONFIG_PM */

static void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
#ifdef CONFIG_NUMA
	unsigned long flags;
	int cpu;
	pg_data_t *pg = z->zone_pgdat;
	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
	struct per_cpu_pageset *p;

	local_irq_save(flags);
	cpu = smp_processor_id();
	p = &z->pageset[cpu];
	if (pg == orig) {
		z->pageset[cpu].numa_hit++;
	} else {
		p->numa_miss++;
		zonelist->zones[0]->pageset[cpu].numa_foreign++;
	}
	if (pg == NODE_DATA(numa_node_id()))
		p->local_node++;
	else
		p->other_node++;
	local_irq_restore(flags);
#endif
}

/*
 * Free a 0-order page
 */
static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
	unsigned long flags;

	arch_free_page(page, 0);

	kernel_map_pages(page, 1, 0);
	inc_page_state(pgfree);
	if (PageAnon(page))
		page->mapping = NULL;
	free_pages_check(__FUNCTION__, page);
	pcp = &zone->pageset[get_cpu()].pcp[cold];
	local_irq_save(flags);
	if (pcp->count >= pcp->high)
		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
	list_add(&page->lru, &pcp->list);
	pcp->count++;
	local_irq_restore(flags);
	put_cpu();
}

void fastcall free_hot_page(struct page *page)
{
	free_hot_cold_page(page, 0);
}

void fastcall free_cold_page(struct page *page)
{
	free_hot_cold_page(page, 1);
}

static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags)
{
	int i;

	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
	for(i = 0; i < (1 << order); i++)
		clear_highpage(page + i);
}

/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
static struct page *
buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
{
	unsigned long flags;
	struct page *page = NULL;
	int cold = !!(gfp_flags & __GFP_COLD);

	if (order == 0) {
		struct per_cpu_pages *pcp;

		pcp = &zone->pageset[get_cpu()].pcp[cold];
		local_irq_save(flags);
		if (pcp->count <= pcp->low)
			pcp->count += rmqueue_bulk(zone, 0,
						pcp->batch, &pcp->list);
		if (pcp->count) {
			page = list_entry(pcp->list.next, struct page, lru);
			list_del(&page->lru);
			pcp->count--;
		}
		local_irq_restore(flags);
		put_cpu();
	}

	if (page == NULL) {
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order);
		spin_unlock_irqrestore(&zone->lock, flags);
	}

	if (page != NULL) {
		BUG_ON(bad_range(zone, page));
		mod_page_state_zone(zone, pgalloc, 1 << order);
		prep_new_page(page, order);

		if (gfp_flags & __GFP_ZERO)
			prep_zero_page(page, order, gfp_flags);

		if (order && (gfp_flags & __GFP_COMP))
			prep_compound_page(page, order);
	}
	return page;
}
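
/*
 * Illustrative note (not from the original source), assuming a pcp
 * tuned to low=62, high=186, batch=31 as set up in
 * free_area_init_core():
 *
 *	- an order-0 allocation that finds pcp->count <= 62 first refills
 *	  the per-cpu list with 31 pages under one zone->lock hold, then
 *	  hands out list entries with no further locking;
 *	- free_hot_cold_page() pushes frees onto the same list and only
 *	  spills 31 pages back to the buddy lists once count reaches 186.
 *
 * Hot (cold == 0) frees are used for cache-warm pages, while __GFP_COLD
 * allocations and cold frees go through the second list so that
 * cache-cold pages are kept for callers that do not care about cache
 * warmth.
 */
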
/*
 * Return 1 if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int can_try_harder, int gfp_high)
{
	/* free_pages may go negative - that's OK */
	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
	int o;

	if (gfp_high)
		min -= min / 2;
	if (can_try_harder)
		min -= min / 4;

	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return 0;
	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;

		/* Require fewer higher order pages to be free */
		min >>= 1;

		if (free_pages <= min)
			return 0;
	}
	return 1;
}
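
/*
 * Worked example (illustrative numbers only): an order-2 request
 * against a zone with mark = 256, lowmem_reserve[classzone_idx] = 0,
 * free_pages = 1000, of which 600 pages sit in order-0 blocks and
 * 200 pages in order-1 blocks:
 *
 *	initial check: 1000 - 4 + 1 = 997 > 256         -> continue
 *	o = 0: free_pages = 997 - 600 = 397, min = 128  -> still OK
 *	o = 1: free_pages = 397 - 200 = 197, min = 64   -> still OK
 *
 * so the watermark passes.  Had the order-0/1 pages accounted for
 * nearly all of the free memory, the loop would have failed the
 * request even though z->free_pages itself looked healthy.
 */
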
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page * fastcall
__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	const int wait = gfp_mask & __GFP_WAIT;
	struct zone **zones, *z;
	struct page *page;
	struct reclaim_state reclaim_state;
	struct task_struct *p = current;
	int i;
	int classzone_idx;
	int do_retry;
	int can_try_harder;
	int did_some_progress;

	might_sleep_if(wait);

	/*
	 * The caller may dip into page reserves a bit more if the caller
	 * cannot run direct reclaim, or if the caller has a realtime
	 * scheduling policy.
	 */
	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;

	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */

	if (unlikely(zones[0] == NULL)) {
		/* Should this ever happen?? */
		return NULL;
	}

	classzone_idx = zone_idx(zones[0]);

restart:
	/* Go through the zonelist once, looking for a zone with enough free */
	for (i = 0; (z = zones[i]) != NULL; i++) {

		if (!zone_watermark_ok(z, order, z->pages_low,
				       classzone_idx, 0, 0))
			continue;

		if (!cpuset_zone_allowed(z))
			continue;

		page = buffered_rmqueue(z, order, gfp_mask);
		if (page)
			goto got_pg;
	}

	for (i = 0; (z = zones[i]) != NULL; i++)
		wakeup_kswapd(z, order);

	/*
	 * Go through the zonelist again. Let __GFP_HIGH and allocations
	 * coming from realtime tasks go deeper into reserves.
	 *
	 * This is the last chance, in general, before the goto nopage.
	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
	 */
	for (i = 0; (z = zones[i]) != NULL; i++) {
		if (!zone_watermark_ok(z, order, z->pages_min,
				       classzone_idx, can_try_harder,
				       gfp_mask & __GFP_HIGH))
			continue;

		if (wait && !cpuset_zone_allowed(z))
			continue;

		page = buffered_rmqueue(z, order, gfp_mask);
		if (page)
			goto got_pg;
	}

	/* This allocation should allow future memory freeing. */

	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
			&& !in_interrupt()) {
		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
			/* go through the zonelist yet again, ignoring mins */
			for (i = 0; (z = zones[i]) != NULL; i++) {
				if (!cpuset_zone_allowed(z))
					continue;
				page = buffered_rmqueue(z, order, gfp_mask);
				if (page)
					goto got_pg;
			}
		}
		goto nopage;
	}

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

rebalance:
	cond_resched();

	/* We now go into synchronous reclaim */
	p->flags |= PF_MEMALLOC;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	did_some_progress = try_to_free_pages(zones, gfp_mask, order);

	p->reclaim_state = NULL;
	p->flags &= ~PF_MEMALLOC;

	cond_resched();

	if (likely(did_some_progress)) {
		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!zone_watermark_ok(z, order, z->pages_min,
					       classzone_idx, can_try_harder,
					       gfp_mask & __GFP_HIGH))
				continue;

			if (!cpuset_zone_allowed(z))
				continue;

			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}
	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
		/*
		 * Go through the zonelist yet one more time, keep
		 * very high watermark here, this is only to catch
		 * a parallel oom killing, we must fail if we're still
		 * under heavy pressure.
		 */
		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!zone_watermark_ok(z, order, z->pages_high,
					       classzone_idx, 0, 0))
				continue;

			if (!cpuset_zone_allowed(z))
				continue;

			page = buffered_rmqueue(z, order, gfp_mask);
			if (page)
				goto got_pg;
		}

		out_of_memory(gfp_mask);
		goto restart;
	}

	/*
	 * Don't let big-order allocations loop unless the caller explicitly
	 * requests that.  Wait for some write requests to complete then retry.
	 *
	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
	 * <= 3, but that may not be true in other implementations.
	 */
	do_retry = 0;
	if (!(gfp_mask & __GFP_NORETRY)) {
		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
			do_retry = 1;
		if (gfp_mask & __GFP_NOFAIL)
			do_retry = 1;
	}
	if (do_retry) {
		blk_congestion_wait(WRITE, HZ/50);
		goto rebalance;
	}

nopage:
	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
		printk(KERN_WARNING "%s: page allocation failure."
			" order:%d, mode:0x%x\n",
			p->comm, order, gfp_mask);
		dump_stack();
	}
	return NULL;
got_pg:
	zone_statistics(zonelist, z);
	return page;
}

EXPORT_SYMBOL(__alloc_pages);
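
/*
 * Illustrative summary of the fallback ladder above (not part of the
 * original source): a hypothetical GFP_KERNEL, order-0 request first
 * scans the zonelist against pages_low, then wakes kswapd and rescans
 * against pages_min (possibly lowered for __GFP_HIGH or realtime
 * callers), then, being allowed to wait, enters direct reclaim via
 * try_to_free_pages() and retries.  A GFP_ATOMIC request stops at the
 * pages_min pass and returns NULL rather than ever reaching the
 * reclaim/retry loop.
 */
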
/*
 * Common helper functions.
 */
fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)
{
	struct page * page;
	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

EXPORT_SYMBOL(__get_free_pages);

fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
{
	struct page * page;

	/*
	 * get_zeroed_page() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	BUG_ON(gfp_mask & __GFP_HIGHMEM);

	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
	if (page)
		return (unsigned long) page_address(page);
	return 0;
}

EXPORT_SYMBOL(get_zeroed_page);

void __pagevec_free(struct pagevec *pvec)
{
	int i = pagevec_count(pvec);

	while (--i >= 0)
		free_hot_cold_page(pvec->pages[i], pvec->cold);
}

fastcall void __free_pages(struct page *page, unsigned int order)
{
	if (!PageReserved(page) && put_page_testzero(page)) {
		if (order == 0)
			free_hot_page(page);
		else
			__free_pages_ok(page, order);
	}
}

EXPORT_SYMBOL(__free_pages);

fastcall void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		BUG_ON(!virt_addr_valid((void *)addr));
		__free_pages(virt_to_page((void *)addr), order);
	}
}

EXPORT_SYMBOL(free_pages);

/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages(void)
{
	unsigned int sum = 0;
	struct zone *zone;

	for_each_zone(zone)
		sum += zone->free_pages;

	return sum;
}

EXPORT_SYMBOL(nr_free_pages);

#ifdef CONFIG_NUMA
unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
{
	unsigned int i, sum = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		sum += pgdat->node_zones[i].free_pages;

	return sum;
}
#endif

static unsigned int nr_free_zone_pages(int offset)
{
	pg_data_t *pgdat;
	unsigned int sum = 0;

	for_each_pgdat(pgdat) {
		struct zonelist *zonelist = pgdat->node_zonelists + offset;
		struct zone **zonep = zonelist->zones;
		struct zone *zone;

		for (zone = *zonep++; zone; zone = *zonep++) {
			unsigned long size = zone->present_pages;
			unsigned long high = zone->pages_high;
			if (size > high)
				sum += size - high;
		}
	}

	return sum;
}

/*
 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
 */
unsigned int nr_free_buffer_pages(void)
{
	return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
}

/*
 * Amount of free RAM allocatable within all zones
 */
unsigned int nr_free_pagecache_pages(void)
{
	return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
}

#ifdef CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;

	for_each_pgdat(pgdat)
		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

	return pages;
}
#endif

#ifdef CONFIG_NUMA
static void show_node(struct zone *zone)
{
	printk("Node %d ", zone->zone_pgdat->node_id);
}
#else
#define show_node(zone)	do { } while (0)
#endif

/*
 * Accumulate the page_state information across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
static DEFINE_PER_CPU(struct page_state, page_states) = {0};

atomic_t nr_pagecache = ATOMIC_INIT(0);
EXPORT_SYMBOL(nr_pagecache);
#ifdef CONFIG_SMP
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif

void __get_page_state(struct page_state *ret, int nr)
{
	int cpu = 0;

	memset(ret, 0, sizeof(*ret));

	cpu = first_cpu(cpu_online_map);
	while (cpu < NR_CPUS) {
		unsigned long *in, *out, off;

		in = (unsigned long *)&per_cpu(page_states, cpu);

		cpu = next_cpu(cpu, cpu_online_map);

		if (cpu < NR_CPUS)
			prefetch(&per_cpu(page_states, cpu));

		out = (unsigned long *)ret;
		for (off = 0; off < nr; off++)
			*out++ += *in++;
	}
}

void get_page_state(struct page_state *ret)
{
	int nr;

	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
	nr /= sizeof(unsigned long);

	__get_page_state(ret, nr + 1);
}

void get_full_page_state(struct page_state *ret)
{
	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
}

unsigned long __read_page_state(unsigned offset)
{
	unsigned long ret = 0;
	int cpu;

	for_each_online_cpu(cpu) {
		unsigned long in;

		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
		ret += *((unsigned long *)in);
	}
	return ret;
}

void __mod_page_state(unsigned offset, unsigned long delta)
{
	unsigned long flags;
	void* ptr;

	local_irq_save(flags);
	ptr = &__get_cpu_var(page_states);
	*(unsigned long*)(ptr + offset) += delta;
	local_irq_restore(flags);
}

EXPORT_SYMBOL(__mod_page_state);

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
			unsigned long *free, struct pglist_data *pgdat)
{
	struct zone *zones = pgdat->node_zones;
	int i;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		*active += zones[i].nr_active;
		*inactive += zones[i].nr_inactive;
		*free += zones[i].free_pages;
	}
}

void get_zone_counts(unsigned long *active,
		unsigned long *inactive, unsigned long *free)
{
	struct pglist_data *pgdat;

	*active = 0;
	*inactive = 0;
	*free = 0;
	for_each_pgdat(pgdat) {
		unsigned long l, m, n;
		__get_zone_counts(&l, &m, &n, pgdat);
		*active += l;
		*inactive += m;
		*free += n;
	}
}

void si_meminfo(struct sysinfo *val)
{
	val->totalram = totalram_pages;
	val->sharedram = 0;
	val->freeram = nr_free_pages();
	val->bufferram = nr_blockdev_pages();
#ifdef CONFIG_HIGHMEM
	val->totalhigh = totalhigh_pages;
	val->freehigh = nr_free_highpages();
#else
	val->totalhigh = 0;
	val->freehigh = 0;
#endif
	val->mem_unit = PAGE_SIZE;
}

EXPORT_SYMBOL(si_meminfo);

#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	val->totalram = pgdat->node_present_pages;
	val->freeram = nr_free_pages_pgdat(pgdat);
	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
	val->mem_unit = PAGE_SIZE;
}
#endif

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
	struct page_state ps;
	int cpu, temperature;
	unsigned long active;
	unsigned long inactive;
	unsigned long free;
	struct zone *zone;

	for_each_zone(zone) {
		show_node(zone);
		printk("%s per-cpu:", zone->name);

		if (!zone->present_pages) {
			printk(" empty\n");
			continue;
		} else
			printk("\n");

		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
			struct per_cpu_pageset *pageset;

			if (!cpu_possible(cpu))
				continue;

			pageset = zone->pageset + cpu;

			for (temperature = 0; temperature < 2; temperature++)
				printk("cpu %d %s: low %d, high %d, batch %d\n",
					cpu,
					temperature ? "cold" : "hot",
					pageset->pcp[temperature].low,
					pageset->pcp[temperature].high,
					pageset->pcp[temperature].batch);
		}
	}

	get_page_state(&ps);
	get_zone_counts(&active, &inactive, &free);

	printk("\nFree pages: %11ukB (%ukB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));

	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
		active,
		inactive,
		ps.nr_dirty,
		ps.nr_writeback,
		ps.nr_unstable,
		nr_free_pages(),
		ps.nr_slab,
		ps.nr_mapped,
		ps.nr_page_table_pages);

	for_each_zone(zone) {
		int i;

		show_node(zone);
		printk("%s"
			" free:%lukB"
			" min:%lukB"
			" low:%lukB"
			" high:%lukB"
			" active:%lukB"
			" inactive:%lukB"
			" present:%lukB"
			" pages_scanned:%lu"
			" all_unreclaimable? %s"
			"\n",
			zone->name,
			K(zone->free_pages),
			K(zone->pages_min),
			K(zone->pages_low),
			K(zone->pages_high),
			K(zone->nr_active),
			K(zone->nr_inactive),
			K(zone->present_pages),
			zone->pages_scanned,
			(zone->all_unreclaimable ? "yes" : "no")
			);
		printk("lowmem_reserve[]:");
		for (i = 0; i < MAX_NR_ZONES; i++)
			printk(" %lu", zone->lowmem_reserve[i]);
		printk("\n");
	}

	for_each_zone(zone) {
		unsigned long nr, flags, order, total = 0;

		show_node(zone);
		printk("%s: ", zone->name);
		if (!zone->present_pages) {
			printk("empty\n");
			continue;
		}

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			nr = zone->free_area[order].nr_free;
			total += nr << order;
			printk("%lu*%lukB ", nr, K(1UL) << order);
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		printk("= %lukB\n", K(total));
	}

	show_swap_cache_info();
}

/*
 * Builds allocation fallback zone lists.
 */
static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
	switch (k) {
		struct zone *zone;
	default:
		BUG();
	case ZONE_HIGHMEM:
		zone = pgdat->node_zones + ZONE_HIGHMEM;
		if (zone->present_pages) {
#ifndef CONFIG_HIGHMEM
			BUG();
#endif
			zonelist->zones[j++] = zone;
		}
	case ZONE_NORMAL:
		zone = pgdat->node_zones + ZONE_NORMAL;
		if (zone->present_pages)
			zonelist->zones[j++] = zone;
	case ZONE_DMA:
		zone = pgdat->node_zones + ZONE_DMA;
		if (zone->present_pages)
			zonelist->zones[j++] = zone;
	}

	return j;
}

#ifdef CONFIG_NUMA
#define MAX_NODE_LOAD (num_online_nodes())
static int __initdata node_load[MAX_NUMNODES];
/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.  The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 * It returns -1 if no node is found.
 */
static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
{
	int i, n, val;
	int min_val = INT_MAX;
	int best_node = -1;

	for_each_online_node(i) {
		cpumask_t tmp;

		/* Start from local node */
		n = (node+i) % num_online_nodes();

		/* Don't want a node to appear more than once */
		if (node_isset(n, *used_node_mask))
			continue;

		/* Use the local node if we haven't already */
		if (!node_isset(node, *used_node_mask)) {
			best_node = node;
			break;
		}

		/* Use the distance array to find the distance */
		val = node_distance(node, n);

		/* Give preference to headless and unused nodes */
		tmp = node_to_cpumask(n);
		if (!cpus_empty(tmp))
			val += PENALTY_FOR_NODE_WITH_CPUS;

		/* Slight preference for less loaded node */
		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
		val += node_load[n];

		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}

	if (best_node >= 0)
		node_set(best_node, *used_node_mask);

	return best_node;
}

static void __init build_zonelists(pg_data_t *pgdat)
{
	int i, j, k, node, local_node;
	int prev_node, load;
	struct zonelist *zonelist;
	nodemask_t used_mask;

	/* initialize zonelists */
	for (i = 0; i < GFP_ZONETYPES; i++) {
		zonelist = pgdat->node_zonelists + i;
		zonelist->zones[0] = NULL;
	}

	/* NUMA-aware ordering of nodes */
	local_node = pgdat->node_id;
	load = num_online_nodes();
	prev_node = local_node;
	nodes_clear(used_mask);
	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
		/*
		 * We don't want to pressure a particular node.
		 * So we add a penalty to the first node in the same
		 * distance group to make the selection round-robin.
		 */
		if (node_distance(local_node, node) !=
				node_distance(local_node, prev_node))
			node_load[node] += load;
		prev_node = node;
		load--;
		for (i = 0; i < GFP_ZONETYPES; i++) {
			zonelist = pgdat->node_zonelists + i;
			for (j = 0; zonelist->zones[j] != NULL; j++);

			k = ZONE_NORMAL;
			if (i & __GFP_HIGHMEM)
				k = ZONE_HIGHMEM;
			if (i & __GFP_DMA)
				k = ZONE_DMA;

			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
			zonelist->zones[j] = NULL;
		}
	}
}

#else	/* CONFIG_NUMA */

static void __init build_zonelists(pg_data_t *pgdat)
{
	int i, j, k, node, local_node;

	local_node = pgdat->node_id;
	for (i = 0; i < GFP_ZONETYPES; i++) {
		struct zonelist *zonelist;

		zonelist = pgdat->node_zonelists + i;

		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		j = build_zonelists_node(pgdat, zonelist, j, k);
		/*
		 * Now we build the zonelist so that it contains the zones
		 * of all the other nodes.
		 * We don't want to pressure a particular node, so when
		 * building the zones for node N, we make sure that the
		 * zones coming right after the local ones are those from
		 * node N+1 (modulo N)
		 */
		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
			if (!node_online(node))
				continue;
			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
		}
		for (node = 0; node < local_node; node++) {
			if (!node_online(node))
				continue;
			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
		}

		zonelist->zones[j] = NULL;
	}
}

#endif	/* CONFIG_NUMA */

void __init build_all_zonelists(void)
{
	int i;

	for_each_online_node(i)
		build_zonelists(NODE_DATA(i));
	printk("Built %i zonelists\n", num_online_nodes());
	cpuset_init_current_mems_allowed();
}

/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256

static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;

	while (size < pages)
		size <<= 1;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	size = min(size, 4096UL);

	return max(size, 4UL);
}
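
/*
 * Worked example (illustrative only): a zone spanning 1GB with 4KB
 * pages has 262144 pages, so
 *
 *	pages / PAGES_PER_WAITQUEUE = 262144 / 256 = 1024
 *
 * and the loop rounds that up to a power of two, giving 1024 entries.
 * A 16GB zone would want 16384 entries but is clamped to 4096, and a
 * tiny 128-page zone is rounded up to the minimum of 4.
 */
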
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	return ffz(~size);
}

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long realtotalpages, totalpages = 0;
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		totalpages += zones_size[i];
	pgdat->node_spanned_pages = totalpages;

	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];
	pgdat->node_present_pages = realtotalpages;
	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}


/*
 * Initially all pages are reserved - free ones are freed
 * up by free_all_bootmem() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 */
void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn)
{
	struct page *start = pfn_to_page(start_pfn);
	struct page *page;

	for (page = start; page < (start + size); page++) {
		set_page_zone(page, NODEZONE(nid, zone));
		set_page_count(page, 0);
		reset_page_mapcount(page);
		SetPageReserved(page);
		INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
		if (!is_highmem_idx(zone))
			set_page_address(page, __va(start_pfn << PAGE_SHIFT));
#endif
		start_pfn++;
	}
}

void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
				unsigned long size)
{
	int order;
	for (order = 0; order < MAX_ORDER ; order++) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list);
		zone->free_area[order].nr_free = 0;
	}
}

#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
	memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
static void __init free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long i, j;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
	int cpu, nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;

	pgdat->nr_zones = 0;
	init_waitqueue_head(&pgdat->kswapd_wait);
	pgdat->kswapd_max_order = 0;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, realsize;
		unsigned long batch;

		zone_table[NODEZONE(nid, j)] = zone;
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		if (j == ZONE_DMA || j == ZONE_NORMAL)
			nr_kernel_pages += realsize;
		nr_all_pages += realsize;

		zone->spanned_pages = size;
		zone->present_pages = realsize;
		zone->name = zone_names[j];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;

		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;

		/*
		 * The per-cpu-pages pools are set to around 1000th of the
		 * size of the zone.  But no more than 1/4 of a meg - there's
		 * no point in going beyond the size of L2 cache.
		 *
		 * OK, so we don't know how big the cache is.  So guess.
		 */
		batch = zone->present_pages / 1024;
		if (batch * PAGE_SIZE > 256 * 1024)
			batch = (256 * 1024) / PAGE_SIZE;
		batch /= 4;		/* We effectively *= 4 below */
		if (batch < 1)
			batch = 1;

		/*
		 * Clamp the batch to a 2^n - 1 value. Having a power
		 * of 2 value was found to be more likely to have
		 * suboptimal cache aliasing properties in some cases.
		 *
		 * For example if 2 tasks are alternately allocating
		 * batches of pages, one task can end up with a lot
		 * of pages of one half of the possible page colors
		 * and the other with pages of the other colors.
		 */
		batch = (1 << fls(batch + batch/2)) - 1;

		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			struct per_cpu_pages *pcp;

			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
			pcp->count = 0;
			pcp->low = 2 * batch;
			pcp->high = 6 * batch;
			pcp->batch = 1 * batch;
			INIT_LIST_HEAD(&pcp->list);

			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
			pcp->count = 0;
			pcp->low = 0;
			pcp->high = 2 * batch;
			pcp->batch = 1 * batch;
			INIT_LIST_HEAD(&pcp->list);
		}
		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
				zone_names[j], realsize, batch);
		INIT_LIST_HEAD(&zone->active_list);
		INIT_LIST_HEAD(&zone->inactive_list);
		zone->nr_scan_active = 0;
		zone->nr_scan_inactive = 0;
		zone->nr_active = 0;
		zone->nr_inactive = 0;
		if (!size)
			continue;

		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_bits =
			wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));

		for(i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);

		pgdat->nr_zones = j+1;

		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
		zone->zone_start_pfn = zone_start_pfn;

		if ((zone_start_pfn) & (zone_required_alignment-1))
			printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");

		memmap_init(size, nid, j, zone_start_pfn);

		zone_start_pfn += size;

		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
	}
}
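
/*
 * Worked example of the per-cpu batch sizing above (illustrative
 * numbers only): a 512MB zone with 4KB pages has present_pages =
 * 131072, so
 *
 *	batch = 131072 / 1024 = 128
 *	128 * PAGE_SIZE = 512KB > 256KB  ->  batch = 64
 *	batch /= 4                       ->  batch = 16
 *	(1 << fls(16 + 8)) - 1           ->  batch = 31
 *
 * giving the hot list low = 62, high = 186, batch = 31 and the cold
 * list low = 0, high = 62, batch = 31.
 */
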
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
	unsigned long size;

	/* Skip empty nodes */
	if (!pgdat->node_spanned_pages)
		return;

	/* ia64 gets its own node_mem_map, before this, without bootmem */
	if (!pgdat->node_mem_map) {
		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
		pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
	}
#ifndef CONFIG_DISCONTIGMEM
	/*
	 * With no DISCONTIG, the global mem_map is just set as node 0's
	 */
	if (pgdat == NODE_DATA(0))
		mem_map = NODE_DATA(0)->node_mem_map;
#endif
}

void __init free_area_init_node(int nid, struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long node_start_pfn,
		unsigned long *zholes_size)
{
	pgdat->node_id = nid;
	pgdat->node_start_pfn = node_start_pfn;
	calculate_zone_totalpages(pgdat, zones_size, zholes_size);

	alloc_node_mem_map(pgdat);

	free_area_init_core(pgdat, zones_size, zholes_size);
}

#ifndef CONFIG_DISCONTIGMEM
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };

EXPORT_SYMBOL(contig_page_data);

void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_node(0, &contig_page_data, zones_size,
			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
#endif

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;

	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return pgdat->pgdat_next;
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;
	int order;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!zone->present_pages)
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
		for (order = 0; order < MAX_ORDER; ++order)
			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static char *vmstat_text[] = {
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_page_table_pages",
	"nr_mapped",
	"nr_slab",

	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",
	"pgalloc_high",

	"pgalloc_normal",
	"pgalloc_dma",
	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",
	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma",
	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",

	"pgscan_kswapd_dma",
	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma",
	"pginodesteal",

	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
	"nr_bounce",
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	struct page_state *ps;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	m->private = ps;
	if (!ps)
		return ERR_PTR(-ENOMEM);
	get_full_page_state(ps);
	ps->pgpgin /= 2;		/* sectors -> kbytes */
	ps->pgpgout /= 2;
	return (unsigned long *)ps + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_HOTPLUG_CPU
static int page_alloc_cpu_notify(struct notifier_block *self,
				 unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	long *count;
	unsigned long *src, *dest;

	if (action == CPU_DEAD) {
		int i;

		/* Drain local pagecache count. */
		count = &per_cpu(nr_pagecache_local, cpu);
		atomic_add(*count, &nr_pagecache);
		*count = 0;
		local_irq_disable();
		__drain_pages(cpu);

		/* Add dead cpu's page_states to our own. */
		dest = (unsigned long *)&__get_cpu_var(page_states);
		src = (unsigned long *)&per_cpu(page_states, cpu);

		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
				i++) {
			dest[i] += src[i];
			src[i] = 0;
		}

		local_irq_enable();
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */

void __init page_alloc_init(void)
{
	hotcpu_notifier(page_alloc_cpu_notify, 0);
}

/*
 * setup_per_zone_lowmem_reserve - called whenever
 *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
 *	has a correct pages reserved value, so an adequate number of
 *	pages are left in the zone after a successful __alloc_pages().
 */
static void setup_per_zone_lowmem_reserve(void)
{
	struct pglist_data *pgdat;
	int j, idx;

	for_each_pgdat(pgdat) {
		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = pgdat->node_zones + j;
			unsigned long present_pages = zone->present_pages;

			zone->lowmem_reserve[j] = 0;

			for (idx = j-1; idx >= 0; idx--) {
				struct zone *lower_zone;

				if (sysctl_lowmem_reserve_ratio[idx] < 1)
					sysctl_lowmem_reserve_ratio[idx] = 1;

				lower_zone = pgdat->node_zones + idx;
				lower_zone->lowmem_reserve[j] = present_pages /
					sysctl_lowmem_reserve_ratio[idx];
				present_pages += lower_zone->present_pages;
			}
		}
	}
}
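
/*
 * Worked example (illustrative, using the default ratios of 256 for
 * DMA and 32 for NORMAL on a machine with 16MB of ZONE_DMA, 784MB of
 * ZONE_NORMAL and 224MB of ZONE_HIGHMEM): while processing
 * j == ZONE_NORMAL,
 *
 *	DMA->lowmem_reserve[ZONE_NORMAL] = 784M / 256
 *	                                   (roughly 3MB worth of pages)
 *
 * and while processing j == ZONE_HIGHMEM,
 *
 *	NORMAL->lowmem_reserve[ZONE_HIGHMEM] = 224M / 32
 *	DMA->lowmem_reserve[ZONE_HIGHMEM]    = (224M + 784M) / 256
 *
 * which matches the summary comment next to
 * sysctl_lowmem_reserve_ratio at the top of this file.
 */
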
/*
 * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
 *	that the pages_{min,low,high} values for each zone are set correctly
 *	with respect to min_free_kbytes.
 */
static void setup_per_zone_pages_min(void)
{
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;

	/* Calculate total number of !ZONE_HIGHMEM pages */
	for_each_zone(zone) {
		if (!is_highmem(zone))
			lowmem_pages += zone->present_pages;
	}

	for_each_zone(zone) {
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (is_highmem(zone)) {
			/*
			 * Often, highmem doesn't need to reserve any pages.
			 * But the pages_min/low/high values are also used for
			 * batching up page reclaim activity so we need a
			 * decent value here.
			 */
			int min_pages;

			min_pages = zone->present_pages / 1024;
			if (min_pages < SWAP_CLUSTER_MAX)
				min_pages = SWAP_CLUSTER_MAX;
			if (min_pages > 128)
				min_pages = 128;
			zone->pages_min = min_pages;
		} else {
			/*
			 * If it's a lowmem zone, reserve a number of pages
			 * proportionate to the zone's size.
			 */
			zone->pages_min = (pages_min * zone->present_pages) /
					   lowmem_pages;
		}

		/*
		 * When interpreting these watermarks, just keep in mind that:
		 * zone->pages_min == (zone->pages_min * 4) / 4;
		 */
		zone->pages_low   = (zone->pages_min * 5) / 4;
		zone->pages_high  = (zone->pages_min * 6) / 4;
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
}

/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min).  For large machines
 * we want it large (64MB max).  But it is not linear, because network
 * bandwidth does not increase linearly with machine size.  We use
 *
 *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB:	512k
 * 32MB:	724k
 * 64MB:	1024k
 * 128MB:	1448k
 * 256MB:	2048k
 * 512MB:	2896k
 * 1024MB:	4096k
 * 2048MB:	5792k
 * 4096MB:	8192k
 * 8192MB:	11584k
 * 16384MB:	16384k
 */
static int __init init_per_zone_pages_min(void)
{
	unsigned long lowmem_kbytes;

	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);

	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;
	setup_per_zone_pages_min();
	setup_per_zone_lowmem_reserve();
	return 0;
}
module_init(init_per_zone_pages_min)

/*
 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
 *	that we can call two helper functions whenever min_free_kbytes
 *	changes.
 */
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
		struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	setup_per_zone_pages_min();
	return 0;
}

/*
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
 *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
 *	whenever sysctl_lowmem_reserve_ratio changes.
 *
 * The reserve ratio obviously has absolutely no relation with the
 * pages_min watermarks. The lowmem reserve ratio can only make sense
 * in relation to the boot-time zone sizes.
 */
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
		struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	setup_per_zone_lowmem_reserve();
	return 0;
}

__initdata int hashdist = HASHDIST_DEFAULT;

#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
{
	if (!str)
		return 0;
	hashdist = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("hashdist=", set_hashdist);
#endif

/*
 * allocate a large system hash table from bootmem
 * - it is assumed that the hash table must contain an exact power-of-2
 *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 */
void *__init alloc_large_system_hash(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
				     int flags,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long limit)
{
	unsigned long long max = limit;
	unsigned long log2qty, size;
	void *table = NULL;

	/* allow the kernel cmdline to have a say */
	if (!numentries) {
		/* round applicable memory size up to nearest megabyte */
		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
		numentries >>= 20 - PAGE_SHIFT;
		numentries <<= 20 - PAGE_SHIFT;

		/* limit to 1 bucket per 2^scale bytes of low memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);
	}
	/* rounded up to nearest power of 2 in size */
	numentries = 1UL << (long_log2(numentries) + 1);

	/* limit allocation size to 1/16 total memory by default */
	if (max == 0) {
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
		do_div(max, bucketsize);
	}

	if (numentries > max)
		numentries = max;

	log2qty = long_log2(numentries);

	do {
		size = bucketsize << log2qty;
		if (flags & HASH_EARLY)
			table = alloc_bootmem(size);
		else if (hashdist)
			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
		else {
			unsigned long order;
			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
				;
			table = (void*) __get_free_pages(GFP_ATOMIC, order);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);

	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);

	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
	       tablename,
	       (1U << log2qty),
	       long_log2(size) - PAGE_SHIFT,
	       size);

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}
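
/*
 * Illustrative usage (hypothetical caller, not part of this file): a
 * subsystem sizing a boot-time hash table might do
 *
 *	my_hash = alloc_large_system_hash("My cache",
 *					  sizeof(struct hlist_head),
 *					  0, 14, 0,
 *					  &my_hash_shift,
 *					  &my_hash_mask, 0);
 *
 * Passing numentries == 0 lets the table be sized from low memory with
 * one bucket per 2^14 bytes, and the returned my_hash_shift and
 * my_hash_mask reflect the power-of-2 bucket count chosen above.
 */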