1 /* 2 * linux/mm/compaction.c 3 * 4 * Memory compaction for the reduction of external fragmentation. Note that 5 * this heavily depends upon page migration to do all the real heavy 6 * lifting 7 * 8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> 9 */ 10 #include <linux/swap.h> 11 #include <linux/migrate.h> 12 #include <linux/compaction.h> 13 #include <linux/mm_inline.h> 14 #include <linux/backing-dev.h> 15 #include <linux/sysctl.h> 16 #include <linux/sysfs.h> 17 #include <linux/balloon_compaction.h> 18 #include <linux/page-isolation.h> 19 #include <linux/kasan.h> 20 #include "internal.h" 21 22 #ifdef CONFIG_COMPACTION 23 static inline void count_compact_event(enum vm_event_item item) 24 { 25 count_vm_event(item); 26 } 27 28 static inline void count_compact_events(enum vm_event_item item, long delta) 29 { 30 count_vm_events(item, delta); 31 } 32 #else 33 #define count_compact_event(item) do { } while (0) 34 #define count_compact_events(item, delta) do { } while (0) 35 #endif 36 37 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 38 39 #define CREATE_TRACE_POINTS 40 #include <trace/events/compaction.h> 41 42 static unsigned long release_freepages(struct list_head *freelist) 43 { 44 struct page *page, *next; 45 unsigned long high_pfn = 0; 46 47 list_for_each_entry_safe(page, next, freelist, lru) { 48 unsigned long pfn = page_to_pfn(page); 49 list_del(&page->lru); 50 __free_page(page); 51 if (pfn > high_pfn) 52 high_pfn = pfn; 53 } 54 55 return high_pfn; 56 } 57 58 static void map_pages(struct list_head *list) 59 { 60 struct page *page; 61 62 list_for_each_entry(page, list, lru) { 63 arch_alloc_page(page, 0); 64 kernel_map_pages(page, 1, 1); 65 kasan_alloc_pages(page, 0); 66 } 67 } 68 69 static inline bool migrate_async_suitable(int migratetype) 70 { 71 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; 72 } 73 74 /* 75 * Check that the whole (or subset of) a pageblock given by the interval of 76 * [start_pfn, end_pfn) is valid and 
within the same zone, before scanning it 77 * with the migration of free compaction scanner. The scanners then need to 78 * use only pfn_valid_within() check for arches that allow holes within 79 * pageblocks. 80 * 81 * Return struct page pointer of start_pfn, or NULL if checks were not passed. 82 * 83 * It's possible on some configurations to have a setup like node0 node1 node0 84 * i.e. it's possible that all pages within a zones range of pages do not 85 * belong to a single zone. We assume that a border between node0 and node1 86 * can occur within a single pageblock, but not a node0 node1 node0 87 * interleaving within a single pageblock. It is therefore sufficient to check 88 * the first and last page of a pageblock and avoid checking each individual 89 * page in a pageblock. 90 */ 91 static struct page *pageblock_pfn_to_page(unsigned long start_pfn, 92 unsigned long end_pfn, struct zone *zone) 93 { 94 struct page *start_page; 95 struct page *end_page; 96 97 /* end_pfn is one past the range we are checking */ 98 end_pfn--; 99 100 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) 101 return NULL; 102 103 start_page = pfn_to_page(start_pfn); 104 105 if (page_zone(start_page) != zone) 106 return NULL; 107 108 end_page = pfn_to_page(end_pfn); 109 110 /* This gives a shorter code than deriving page_zone(end_page) */ 111 if (page_zone_id(start_page) != page_zone_id(end_page)) 112 return NULL; 113 114 return start_page; 115 } 116 117 #ifdef CONFIG_COMPACTION 118 119 /* Do not skip compaction more than 64 times */ 120 #define COMPACT_MAX_DEFER_SHIFT 6 121 122 /* 123 * Compaction is deferred when compaction fails to result in a page 124 * allocation success. 
1 << compact_defer_limit compactions are skipped up 125 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT 126 */ 127 void defer_compaction(struct zone *zone, int order) 128 { 129 zone->compact_considered = 0; 130 zone->compact_defer_shift++; 131 132 if (order < zone->compact_order_failed) 133 zone->compact_order_failed = order; 134 135 if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) 136 zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; 137 138 trace_mm_compaction_defer_compaction(zone, order); 139 } 140 141 /* Returns true if compaction should be skipped this time */ 142 bool compaction_deferred(struct zone *zone, int order) 143 { 144 unsigned long defer_limit = 1UL << zone->compact_defer_shift; 145 146 if (order < zone->compact_order_failed) 147 return false; 148 149 /* Avoid possible overflow */ 150 if (++zone->compact_considered > defer_limit) 151 zone->compact_considered = defer_limit; 152 153 if (zone->compact_considered >= defer_limit) 154 return false; 155 156 trace_mm_compaction_deferred(zone, order); 157 158 return true; 159 } 160 161 /* 162 * Update defer tracking counters after successful compaction of given order, 163 * which means an allocation either succeeded (alloc_success == true) or is 164 * expected to succeed. 
165 */ 166 void compaction_defer_reset(struct zone *zone, int order, 167 bool alloc_success) 168 { 169 if (alloc_success) { 170 zone->compact_considered = 0; 171 zone->compact_defer_shift = 0; 172 } 173 if (order >= zone->compact_order_failed) 174 zone->compact_order_failed = order + 1; 175 176 trace_mm_compaction_defer_reset(zone, order); 177 } 178 179 /* Returns true if restarting compaction after many failures */ 180 bool compaction_restarting(struct zone *zone, int order) 181 { 182 if (order < zone->compact_order_failed) 183 return false; 184 185 return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && 186 zone->compact_considered >= 1UL << zone->compact_defer_shift; 187 } 188 189 /* Returns true if the pageblock should be scanned for pages to isolate. */ 190 static inline bool isolation_suitable(struct compact_control *cc, 191 struct page *page) 192 { 193 if (cc->ignore_skip_hint) 194 return true; 195 196 return !get_pageblock_skip(page); 197 } 198 199 static void reset_cached_positions(struct zone *zone) 200 { 201 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; 202 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; 203 zone->compact_cached_free_pfn = zone_end_pfn(zone); 204 } 205 206 /* 207 * This function is called to clear all cached information on pageblocks that 208 * should be skipped for page isolation when the migrate and free page scanner 209 * meet. 
210 */ 211 static void __reset_isolation_suitable(struct zone *zone) 212 { 213 unsigned long start_pfn = zone->zone_start_pfn; 214 unsigned long end_pfn = zone_end_pfn(zone); 215 unsigned long pfn; 216 217 zone->compact_blockskip_flush = false; 218 219 /* Walk the zone and mark every pageblock as suitable for isolation */ 220 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 221 struct page *page; 222 223 cond_resched(); 224 225 if (!pfn_valid(pfn)) 226 continue; 227 228 page = pfn_to_page(pfn); 229 if (zone != page_zone(page)) 230 continue; 231 232 clear_pageblock_skip(page); 233 } 234 235 reset_cached_positions(zone); 236 } 237 238 void reset_isolation_suitable(pg_data_t *pgdat) 239 { 240 int zoneid; 241 242 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 243 struct zone *zone = &pgdat->node_zones[zoneid]; 244 if (!populated_zone(zone)) 245 continue; 246 247 /* Only flush if a full compaction finished recently */ 248 if (zone->compact_blockskip_flush) 249 __reset_isolation_suitable(zone); 250 } 251 } 252 253 /* 254 * If no pages were isolated then mark this pageblock to be skipped in the 255 * future. The information is later cleared by __reset_isolation_suitable(). 
256 */ 257 static void update_pageblock_skip(struct compact_control *cc, 258 struct page *page, unsigned long nr_isolated, 259 bool migrate_scanner) 260 { 261 struct zone *zone = cc->zone; 262 unsigned long pfn; 263 264 if (cc->ignore_skip_hint) 265 return; 266 267 if (!page) 268 return; 269 270 if (nr_isolated) 271 return; 272 273 set_pageblock_skip(page); 274 275 pfn = page_to_pfn(page); 276 277 /* Update where async and sync compaction should restart */ 278 if (migrate_scanner) { 279 if (pfn > zone->compact_cached_migrate_pfn[0]) 280 zone->compact_cached_migrate_pfn[0] = pfn; 281 if (cc->mode != MIGRATE_ASYNC && 282 pfn > zone->compact_cached_migrate_pfn[1]) 283 zone->compact_cached_migrate_pfn[1] = pfn; 284 } else { 285 if (pfn < zone->compact_cached_free_pfn) 286 zone->compact_cached_free_pfn = pfn; 287 } 288 } 289 #else 290 static inline bool isolation_suitable(struct compact_control *cc, 291 struct page *page) 292 { 293 return true; 294 } 295 296 static void update_pageblock_skip(struct compact_control *cc, 297 struct page *page, unsigned long nr_isolated, 298 bool migrate_scanner) 299 { 300 } 301 #endif /* CONFIG_COMPACTION */ 302 303 /* 304 * Compaction requires the taking of some coarse locks that are potentially 305 * very heavily contended. For async compaction, back out if the lock cannot 306 * be taken immediately. For sync compaction, spin on the lock if needed. 307 * 308 * Returns true if the lock is held 309 * Returns false if the lock is not held and compaction should abort 310 */ 311 static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, 312 struct compact_control *cc) 313 { 314 if (cc->mode == MIGRATE_ASYNC) { 315 if (!spin_trylock_irqsave(lock, *flags)) { 316 cc->contended = COMPACT_CONTENDED_LOCK; 317 return false; 318 } 319 } else { 320 spin_lock_irqsave(lock, *flags); 321 } 322 323 return true; 324 } 325 326 /* 327 * Compaction requires the taking of some coarse locks that are potentially 328 * very heavily contended. 
The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, async compaction
 * aborts. Sync compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to fatal signal pending, or
 *		async compaction due to need_resched()
 * Returns false when compaction can continue (sync compaction might have
 *		scheduled)
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	/* Drop the lock first; the caller must re-take it if needed */
	if (*locked) {
		spin_unlock_irqrestore(lock, flags);
		*locked = false;
	}

	if (fatal_signal_pending(current)) {
		cc->contended = COMPACT_CONTENDED_SCHED;
		return true;
	}

	if (!need_resched())
		return false;

	if (cc->mode == MIGRATE_ASYNC) {
		cc->contended = COMPACT_CONTENDED_SCHED;
		return true;
	}

	cond_resched();
	return false;
}

/*
 * Aside from avoiding lock contention, compaction also periodically checks
 * need_resched() and either schedules in sync compaction or aborts async
 * compaction. This is similar to what compact_unlock_should_abort() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
 * Returns true when async compaction should abort.
373 */ 374 static inline bool compact_should_abort(struct compact_control *cc) 375 { 376 /* async compaction aborts if contended */ 377 if (need_resched()) { 378 if (cc->mode == MIGRATE_ASYNC) { 379 cc->contended = COMPACT_CONTENDED_SCHED; 380 return true; 381 } 382 383 cond_resched(); 384 } 385 386 return false; 387 } 388 389 /* 390 * Isolate free pages onto a private freelist. If @strict is true, will abort 391 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock 392 * (even though it may still end up isolating some pages). 393 */ 394 static unsigned long isolate_freepages_block(struct compact_control *cc, 395 unsigned long *start_pfn, 396 unsigned long end_pfn, 397 struct list_head *freelist, 398 bool strict) 399 { 400 int nr_scanned = 0, total_isolated = 0; 401 struct page *cursor, *valid_page = NULL; 402 unsigned long flags = 0; 403 bool locked = false; 404 unsigned long blockpfn = *start_pfn; 405 406 cursor = pfn_to_page(blockpfn); 407 408 /* Isolate free pages. */ 409 for (; blockpfn < end_pfn; blockpfn++, cursor++) { 410 int isolated, i; 411 struct page *page = cursor; 412 413 /* 414 * Periodically drop the lock (if held) regardless of its 415 * contention, to give chance to IRQs. Abort if fatal signal 416 * pending or async compaction detects need_resched() 417 */ 418 if (!(blockpfn % SWAP_CLUSTER_MAX) 419 && compact_unlock_should_abort(&cc->zone->lock, flags, 420 &locked, cc)) 421 break; 422 423 nr_scanned++; 424 if (!pfn_valid_within(blockpfn)) 425 goto isolate_fail; 426 427 if (!valid_page) 428 valid_page = page; 429 430 /* 431 * For compound pages such as THP and hugetlbfs, we can save 432 * potentially a lot of iterations if we skip them at once. 433 * The check is racy, but we can consider only valid values 434 * and the only danger is skipping too much. 
435 */ 436 if (PageCompound(page)) { 437 unsigned int comp_order = compound_order(page); 438 439 if (likely(comp_order < MAX_ORDER)) { 440 blockpfn += (1UL << comp_order) - 1; 441 cursor += (1UL << comp_order) - 1; 442 } 443 444 goto isolate_fail; 445 } 446 447 if (!PageBuddy(page)) 448 goto isolate_fail; 449 450 /* 451 * If we already hold the lock, we can skip some rechecking. 452 * Note that if we hold the lock now, checked_pageblock was 453 * already set in some previous iteration (or strict is true), 454 * so it is correct to skip the suitable migration target 455 * recheck as well. 456 */ 457 if (!locked) { 458 /* 459 * The zone lock must be held to isolate freepages. 460 * Unfortunately this is a very coarse lock and can be 461 * heavily contended if there are parallel allocations 462 * or parallel compactions. For async compaction do not 463 * spin on the lock and we acquire the lock as late as 464 * possible. 465 */ 466 locked = compact_trylock_irqsave(&cc->zone->lock, 467 &flags, cc); 468 if (!locked) 469 break; 470 471 /* Recheck this is a buddy page under lock */ 472 if (!PageBuddy(page)) 473 goto isolate_fail; 474 } 475 476 /* Found a free page, break it into order-0 pages */ 477 isolated = split_free_page(page); 478 total_isolated += isolated; 479 for (i = 0; i < isolated; i++) { 480 list_add(&page->lru, freelist); 481 page++; 482 } 483 484 /* If a page was split, advance to the end of it */ 485 if (isolated) { 486 cc->nr_freepages += isolated; 487 if (!strict && 488 cc->nr_migratepages <= cc->nr_freepages) { 489 blockpfn += isolated; 490 break; 491 } 492 493 blockpfn += isolated - 1; 494 cursor += isolated - 1; 495 continue; 496 } 497 498 isolate_fail: 499 if (strict) 500 break; 501 else 502 continue; 503 504 } 505 506 /* 507 * There is a tiny chance that we have read bogus compound_order(), 508 * so be careful to not go outside of the pageblock. 
509 */ 510 if (unlikely(blockpfn > end_pfn)) 511 blockpfn = end_pfn; 512 513 trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, 514 nr_scanned, total_isolated); 515 516 /* Record how far we have got within the block */ 517 *start_pfn = blockpfn; 518 519 /* 520 * If strict isolation is requested by CMA then check that all the 521 * pages requested were isolated. If there were any failures, 0 is 522 * returned and CMA will fail. 523 */ 524 if (strict && blockpfn < end_pfn) 525 total_isolated = 0; 526 527 if (locked) 528 spin_unlock_irqrestore(&cc->zone->lock, flags); 529 530 /* Update the pageblock-skip if the whole pageblock was scanned */ 531 if (blockpfn == end_pfn) 532 update_pageblock_skip(cc, valid_page, total_isolated, false); 533 534 count_compact_events(COMPACTFREE_SCANNED, nr_scanned); 535 if (total_isolated) 536 count_compact_events(COMPACTISOLATED, total_isolated); 537 return total_isolated; 538 } 539 540 /** 541 * isolate_freepages_range() - isolate free pages. 542 * @start_pfn: The first PFN to start isolating. 543 * @end_pfn: The one-past-last PFN. 544 * 545 * Non-free pages, invalid PFNs, or zone boundaries within the 546 * [start_pfn, end_pfn) range are considered errors, cause function to 547 * undo its actions and return zero. 548 * 549 * Otherwise, function returns one-past-the-last PFN of isolated page 550 * (which may be greater then end_pfn if end fell in a middle of 551 * a free page). 
552 */ 553 unsigned long 554 isolate_freepages_range(struct compact_control *cc, 555 unsigned long start_pfn, unsigned long end_pfn) 556 { 557 unsigned long isolated, pfn, block_end_pfn; 558 LIST_HEAD(freelist); 559 560 pfn = start_pfn; 561 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 562 563 for (; pfn < end_pfn; pfn += isolated, 564 block_end_pfn += pageblock_nr_pages) { 565 /* Protect pfn from changing by isolate_freepages_block */ 566 unsigned long isolate_start_pfn = pfn; 567 568 block_end_pfn = min(block_end_pfn, end_pfn); 569 570 /* 571 * pfn could pass the block_end_pfn if isolated freepage 572 * is more than pageblock order. In this case, we adjust 573 * scanning range to right one. 574 */ 575 if (pfn >= block_end_pfn) { 576 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 577 block_end_pfn = min(block_end_pfn, end_pfn); 578 } 579 580 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) 581 break; 582 583 isolated = isolate_freepages_block(cc, &isolate_start_pfn, 584 block_end_pfn, &freelist, true); 585 586 /* 587 * In strict mode, isolate_freepages_block() returns 0 if 588 * there are any holes in the block (ie. invalid PFNs or 589 * non-free pages). 590 */ 591 if (!isolated) 592 break; 593 594 /* 595 * If we managed to isolate pages, it is always (1 << n) * 596 * pageblock_nr_pages for some non-negative n. (Max order 597 * page may span two pageblocks). 598 */ 599 } 600 601 /* split_free_page does not map the pages */ 602 map_pages(&freelist); 603 604 if (pfn < end_pfn) { 605 /* Loop terminated early, cleanup. */ 606 release_freepages(&freelist); 607 return 0; 608 } 609 610 /* We don't use freelists for anything. 
*/ 611 return pfn; 612 } 613 614 /* Update the number of anon and file isolated pages in the zone */ 615 static void acct_isolated(struct zone *zone, struct compact_control *cc) 616 { 617 struct page *page; 618 unsigned int count[2] = { 0, }; 619 620 if (list_empty(&cc->migratepages)) 621 return; 622 623 list_for_each_entry(page, &cc->migratepages, lru) 624 count[!!page_is_file_cache(page)]++; 625 626 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 627 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 628 } 629 630 /* Similar to reclaim, but different enough that they don't share logic */ 631 static bool too_many_isolated(struct zone *zone) 632 { 633 unsigned long active, inactive, isolated; 634 635 inactive = zone_page_state(zone, NR_INACTIVE_FILE) + 636 zone_page_state(zone, NR_INACTIVE_ANON); 637 active = zone_page_state(zone, NR_ACTIVE_FILE) + 638 zone_page_state(zone, NR_ACTIVE_ANON); 639 isolated = zone_page_state(zone, NR_ISOLATED_FILE) + 640 zone_page_state(zone, NR_ISOLATED_ANON); 641 642 return isolated > (inactive + active) / 2; 643 } 644 645 /** 646 * isolate_migratepages_block() - isolate all migrate-able pages within 647 * a single pageblock 648 * @cc: Compaction control structure. 649 * @low_pfn: The first PFN to isolate 650 * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock 651 * @isolate_mode: Isolation mode to be used. 652 * 653 * Isolate all pages that can be migrated from the range specified by 654 * [low_pfn, end_pfn). The range is expected to be within same pageblock. 655 * Returns zero if there is a fatal signal pending, otherwise PFN of the 656 * first page that was not scanned (which may be both less, equal to or more 657 * than end_pfn). 658 * 659 * The pages are isolated on cc->migratepages list (not required to be empty), 660 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field 661 * is neither read nor updated. 
662 */ 663 static unsigned long 664 isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, 665 unsigned long end_pfn, isolate_mode_t isolate_mode) 666 { 667 struct zone *zone = cc->zone; 668 unsigned long nr_scanned = 0, nr_isolated = 0; 669 struct list_head *migratelist = &cc->migratepages; 670 struct lruvec *lruvec; 671 unsigned long flags = 0; 672 bool locked = false; 673 struct page *page = NULL, *valid_page = NULL; 674 unsigned long start_pfn = low_pfn; 675 676 /* 677 * Ensure that there are not too many pages isolated from the LRU 678 * list by either parallel reclaimers or compaction. If there are, 679 * delay for some time until fewer pages are isolated 680 */ 681 while (unlikely(too_many_isolated(zone))) { 682 /* async migration should just abort */ 683 if (cc->mode == MIGRATE_ASYNC) 684 return 0; 685 686 congestion_wait(BLK_RW_ASYNC, HZ/10); 687 688 if (fatal_signal_pending(current)) 689 return 0; 690 } 691 692 if (compact_should_abort(cc)) 693 return 0; 694 695 /* Time to isolate some pages for migration */ 696 for (; low_pfn < end_pfn; low_pfn++) { 697 bool is_lru; 698 699 /* 700 * Periodically drop the lock (if held) regardless of its 701 * contention, to give chance to IRQs. Abort async compaction 702 * if contended. 703 */ 704 if (!(low_pfn % SWAP_CLUSTER_MAX) 705 && compact_unlock_should_abort(&zone->lru_lock, flags, 706 &locked, cc)) 707 break; 708 709 if (!pfn_valid_within(low_pfn)) 710 continue; 711 nr_scanned++; 712 713 page = pfn_to_page(low_pfn); 714 715 if (!valid_page) 716 valid_page = page; 717 718 /* 719 * Skip if free. We read page order here without zone lock 720 * which is generally unsafe, but the race window is small and 721 * the worst thing that can happen is that we skip some 722 * potential isolation targets. 723 */ 724 if (PageBuddy(page)) { 725 unsigned long freepage_order = page_order_unsafe(page); 726 727 /* 728 * Without lock, we cannot be sure that what we got is 729 * a valid page order. 
Consider only values in the 730 * valid order range to prevent low_pfn overflow. 731 */ 732 if (freepage_order > 0 && freepage_order < MAX_ORDER) 733 low_pfn += (1UL << freepage_order) - 1; 734 continue; 735 } 736 737 /* 738 * Check may be lockless but that's ok as we recheck later. 739 * It's possible to migrate LRU pages and balloon pages 740 * Skip any other type of page 741 */ 742 is_lru = PageLRU(page); 743 if (!is_lru) { 744 if (unlikely(balloon_page_movable(page))) { 745 if (balloon_page_isolate(page)) { 746 /* Successfully isolated */ 747 goto isolate_success; 748 } 749 } 750 } 751 752 /* 753 * Regardless of being on LRU, compound pages such as THP and 754 * hugetlbfs are not to be compacted. We can potentially save 755 * a lot of iterations if we skip them at once. The check is 756 * racy, but we can consider only valid values and the only 757 * danger is skipping too much. 758 */ 759 if (PageCompound(page)) { 760 unsigned int comp_order = compound_order(page); 761 762 if (likely(comp_order < MAX_ORDER)) 763 low_pfn += (1UL << comp_order) - 1; 764 765 continue; 766 } 767 768 if (!is_lru) 769 continue; 770 771 /* 772 * Migration will fail if an anonymous page is pinned in memory, 773 * so avoid taking lru_lock and isolating it unnecessarily in an 774 * admittedly racy check. 775 */ 776 if (!page_mapping(page) && 777 page_count(page) > page_mapcount(page)) 778 continue; 779 780 /* If we already hold the lock, we can skip some rechecking */ 781 if (!locked) { 782 locked = compact_trylock_irqsave(&zone->lru_lock, 783 &flags, cc); 784 if (!locked) 785 break; 786 787 /* Recheck PageLRU and PageCompound under lock */ 788 if (!PageLRU(page)) 789 continue; 790 791 /* 792 * Page become compound since the non-locked check, 793 * and it's on LRU. It can only be a THP so the order 794 * is safe to read and it's 0 for tail pages. 
795 */ 796 if (unlikely(PageCompound(page))) { 797 low_pfn += (1UL << compound_order(page)) - 1; 798 continue; 799 } 800 } 801 802 lruvec = mem_cgroup_page_lruvec(page, zone); 803 804 /* Try isolate the page */ 805 if (__isolate_lru_page(page, isolate_mode) != 0) 806 continue; 807 808 VM_BUG_ON_PAGE(PageCompound(page), page); 809 810 /* Successfully isolated */ 811 del_page_from_lru_list(page, lruvec, page_lru(page)); 812 813 isolate_success: 814 list_add(&page->lru, migratelist); 815 cc->nr_migratepages++; 816 nr_isolated++; 817 818 /* Avoid isolating too much */ 819 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 820 ++low_pfn; 821 break; 822 } 823 } 824 825 /* 826 * The PageBuddy() check could have potentially brought us outside 827 * the range to be scanned. 828 */ 829 if (unlikely(low_pfn > end_pfn)) 830 low_pfn = end_pfn; 831 832 if (locked) 833 spin_unlock_irqrestore(&zone->lru_lock, flags); 834 835 /* 836 * Update the pageblock-skip information and cached scanner pfn, 837 * if the whole pageblock was scanned without isolating any page. 838 */ 839 if (low_pfn == end_pfn) 840 update_pageblock_skip(cc, valid_page, nr_isolated, true); 841 842 trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, 843 nr_scanned, nr_isolated); 844 845 count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); 846 if (nr_isolated) 847 count_compact_events(COMPACTISOLATED, nr_isolated); 848 849 return low_pfn; 850 } 851 852 /** 853 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range 854 * @cc: Compaction control structure. 855 * @start_pfn: The first PFN to start isolating. 856 * @end_pfn: The one-past-last PFN. 857 * 858 * Returns zero if isolation fails fatally due to e.g. pending signal. 859 * Otherwise, function returns one-past-the-last PFN of isolated page 860 * (which may be greater than end_pfn if end fell in a middle of a THP page). 
861 */ 862 unsigned long 863 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, 864 unsigned long end_pfn) 865 { 866 unsigned long pfn, block_end_pfn; 867 868 /* Scan block by block. First and last block may be incomplete */ 869 pfn = start_pfn; 870 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 871 872 for (; pfn < end_pfn; pfn = block_end_pfn, 873 block_end_pfn += pageblock_nr_pages) { 874 875 block_end_pfn = min(block_end_pfn, end_pfn); 876 877 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) 878 continue; 879 880 pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, 881 ISOLATE_UNEVICTABLE); 882 883 /* 884 * In case of fatal failure, release everything that might 885 * have been isolated in the previous iteration, and signal 886 * the failure back to caller. 887 */ 888 if (!pfn) { 889 putback_movable_pages(&cc->migratepages); 890 cc->nr_migratepages = 0; 891 break; 892 } 893 894 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 895 break; 896 } 897 acct_isolated(cc->zone, cc); 898 899 return pfn; 900 } 901 902 #endif /* CONFIG_COMPACTION || CONFIG_CMA */ 903 #ifdef CONFIG_COMPACTION 904 905 /* Returns true if the page is within a block suitable for migration to */ 906 static bool suitable_migration_target(struct page *page) 907 { 908 /* If the page is a large free page, then disallow migration */ 909 if (PageBuddy(page)) { 910 /* 911 * We are checking page_order without zone->lock taken. But 912 * the only small danger is that we skip a potentially suitable 913 * pageblock, so it's not worth to check order for valid range. 
914 */ 915 if (page_order_unsafe(page) >= pageblock_order) 916 return false; 917 } 918 919 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 920 if (migrate_async_suitable(get_pageblock_migratetype(page))) 921 return true; 922 923 /* Otherwise skip the block */ 924 return false; 925 } 926 927 /* 928 * Test whether the free scanner has reached the same or lower pageblock than 929 * the migration scanner, and compaction should thus terminate. 930 */ 931 static inline bool compact_scanners_met(struct compact_control *cc) 932 { 933 return (cc->free_pfn >> pageblock_order) 934 <= (cc->migrate_pfn >> pageblock_order); 935 } 936 937 /* 938 * Based on information in the current compact_control, find blocks 939 * suitable for isolating free pages from and then isolate them. 940 */ 941 static void isolate_freepages(struct compact_control *cc) 942 { 943 struct zone *zone = cc->zone; 944 struct page *page; 945 unsigned long block_start_pfn; /* start of current pageblock */ 946 unsigned long isolate_start_pfn; /* exact pfn we start at */ 947 unsigned long block_end_pfn; /* end of current pageblock */ 948 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 949 struct list_head *freelist = &cc->freepages; 950 951 /* 952 * Initialise the free scanner. The starting point is where we last 953 * successfully isolated from, zone-cached value, or the end of the 954 * zone when isolating for the first time. For looping we also need 955 * this pfn aligned down to the pageblock boundary, because we do 956 * block_start_pfn -= pageblock_nr_pages in the for loop. 957 * For ending point, take care when isolating in last pageblock of a 958 * a zone which ends in the middle of a pageblock. 959 * The low boundary is the end of the pageblock the migration scanner 960 * is using. 
961 */ 962 isolate_start_pfn = cc->free_pfn; 963 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); 964 block_end_pfn = min(block_start_pfn + pageblock_nr_pages, 965 zone_end_pfn(zone)); 966 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); 967 968 /* 969 * Isolate free pages until enough are available to migrate the 970 * pages on cc->migratepages. We stop searching if the migrate 971 * and free page scanners meet or enough free pages are isolated. 972 */ 973 for (; block_start_pfn >= low_pfn; 974 block_end_pfn = block_start_pfn, 975 block_start_pfn -= pageblock_nr_pages, 976 isolate_start_pfn = block_start_pfn) { 977 978 /* 979 * This can iterate a massively long zone without finding any 980 * suitable migration targets, so periodically check if we need 981 * to schedule, or even abort async compaction. 982 */ 983 if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) 984 && compact_should_abort(cc)) 985 break; 986 987 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, 988 zone); 989 if (!page) 990 continue; 991 992 /* Check the block is suitable for migration */ 993 if (!suitable_migration_target(page)) 994 continue; 995 996 /* If isolation recently failed, do not retry */ 997 if (!isolation_suitable(cc, page)) 998 continue; 999 1000 /* Found a block suitable for isolating free pages from. */ 1001 isolate_freepages_block(cc, &isolate_start_pfn, 1002 block_end_pfn, freelist, false); 1003 1004 /* 1005 * If we isolated enough freepages, or aborted due to async 1006 * compaction being contended, terminate the loop. 1007 * Remember where the free scanner should restart next time, 1008 * which is where isolate_freepages_block() left off. 1009 * But if it scanned the whole pageblock, isolate_start_pfn 1010 * now points at block_end_pfn, which is the start of the next 1011 * pageblock. 1012 * In that case we will however want to restart at the start 1013 * of the previous pageblock. 
1014 */ 1015 if ((cc->nr_freepages >= cc->nr_migratepages) 1016 || cc->contended) { 1017 if (isolate_start_pfn >= block_end_pfn) 1018 isolate_start_pfn = 1019 block_start_pfn - pageblock_nr_pages; 1020 break; 1021 } else { 1022 /* 1023 * isolate_freepages_block() should not terminate 1024 * prematurely unless contended, or isolated enough 1025 */ 1026 VM_BUG_ON(isolate_start_pfn < block_end_pfn); 1027 } 1028 } 1029 1030 /* split_free_page does not map the pages */ 1031 map_pages(freelist); 1032 1033 /* 1034 * Record where the free scanner will restart next time. Either we 1035 * broke from the loop and set isolate_start_pfn based on the last 1036 * call to isolate_freepages_block(), or we met the migration scanner 1037 * and the loop terminated due to isolate_start_pfn < low_pfn 1038 */ 1039 cc->free_pfn = isolate_start_pfn; 1040 } 1041 1042 /* 1043 * This is a migrate-callback that "allocates" freepages by taking pages 1044 * from the isolated freelists in the block we are migrating to. 1045 */ 1046 static struct page *compaction_alloc(struct page *migratepage, 1047 unsigned long data, 1048 int **result) 1049 { 1050 struct compact_control *cc = (struct compact_control *)data; 1051 struct page *freepage; 1052 1053 /* 1054 * Isolate free pages if necessary, and if we are not aborting due to 1055 * contention. 1056 */ 1057 if (list_empty(&cc->freepages)) { 1058 if (!cc->contended) 1059 isolate_freepages(cc); 1060 1061 if (list_empty(&cc->freepages)) 1062 return NULL; 1063 } 1064 1065 freepage = list_entry(cc->freepages.next, struct page, lru); 1066 list_del(&freepage->lru); 1067 cc->nr_freepages--; 1068 1069 return freepage; 1070 } 1071 1072 /* 1073 * This is a migrate-callback that "frees" freepages back to the isolated 1074 * freelist. All pages on the freelist are from the same zone, so there is no 1075 * special handling needed for NUMA. 
1076 */ 1077 static void compaction_free(struct page *page, unsigned long data) 1078 { 1079 struct compact_control *cc = (struct compact_control *)data; 1080 1081 list_add(&page->lru, &cc->freepages); 1082 cc->nr_freepages++; 1083 } 1084 1085 /* possible outcome of isolate_migratepages */ 1086 typedef enum { 1087 ISOLATE_ABORT, /* Abort compaction now */ 1088 ISOLATE_NONE, /* No pages isolated, continue scanning */ 1089 ISOLATE_SUCCESS, /* Pages isolated, migrate */ 1090 } isolate_migrate_t; 1091 1092 /* 1093 * Allow userspace to control policy on scanning the unevictable LRU for 1094 * compactable pages. 1095 */ 1096 int sysctl_compact_unevictable_allowed __read_mostly = 1; 1097 1098 /* 1099 * Isolate all pages that can be migrated from the first suitable block, 1100 * starting at the block pointed to by the migrate scanner pfn within 1101 * compact_control. 1102 */ 1103 static isolate_migrate_t isolate_migratepages(struct zone *zone, 1104 struct compact_control *cc) 1105 { 1106 unsigned long low_pfn, end_pfn; 1107 unsigned long isolate_start_pfn; 1108 struct page *page; 1109 const isolate_mode_t isolate_mode = 1110 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | 1111 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1112 1113 /* 1114 * Start at where we last stopped, or beginning of the zone as 1115 * initialized by compact_zone() 1116 */ 1117 low_pfn = cc->migrate_pfn; 1118 1119 /* Only scan within a pageblock boundary */ 1120 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); 1121 1122 /* 1123 * Iterate over whole pageblocks until we find the first suitable. 1124 * Do not cross the free scanner. 1125 */ 1126 for (; end_pfn <= cc->free_pfn; 1127 low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { 1128 1129 /* 1130 * This can potentially iterate a massively long zone with 1131 * many pageblocks unsuitable, so periodically check if we 1132 * need to schedule, or even abort async compaction. 
1133 */ 1134 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) 1135 && compact_should_abort(cc)) 1136 break; 1137 1138 page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); 1139 if (!page) 1140 continue; 1141 1142 /* If isolation recently failed, do not retry */ 1143 if (!isolation_suitable(cc, page)) 1144 continue; 1145 1146 /* 1147 * For async compaction, also only scan in MOVABLE blocks. 1148 * Async compaction is optimistic to see if the minimum amount 1149 * of work satisfies the allocation. 1150 */ 1151 if (cc->mode == MIGRATE_ASYNC && 1152 !migrate_async_suitable(get_pageblock_migratetype(page))) 1153 continue; 1154 1155 /* Perform the isolation */ 1156 isolate_start_pfn = low_pfn; 1157 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, 1158 isolate_mode); 1159 1160 if (!low_pfn || cc->contended) { 1161 acct_isolated(zone, cc); 1162 return ISOLATE_ABORT; 1163 } 1164 1165 /* 1166 * Record where we could have freed pages by migration and not 1167 * yet flushed them to buddy allocator. 1168 * - this is the lowest page that could have been isolated and 1169 * then freed by migration. 1170 */ 1171 if (cc->nr_migratepages && !cc->last_migrated_pfn) 1172 cc->last_migrated_pfn = isolate_start_pfn; 1173 1174 /* 1175 * Either we isolated something and proceed with migration. Or 1176 * we failed and compact_zone should decide if we should 1177 * continue or not. 1178 */ 1179 break; 1180 } 1181 1182 acct_isolated(zone, cc); 1183 /* Record where migration scanner will be restarted. */ 1184 cc->migrate_pfn = low_pfn; 1185 1186 return cc->nr_migratepages ? 
ISOLATE_SUCCESS : ISOLATE_NONE; 1187 } 1188 1189 /* 1190 * order == -1 is expected when compacting via 1191 * /proc/sys/vm/compact_memory 1192 */ 1193 static inline bool is_via_compact_memory(int order) 1194 { 1195 return order == -1; 1196 } 1197 1198 static int __compact_finished(struct zone *zone, struct compact_control *cc, 1199 const int migratetype) 1200 { 1201 unsigned int order; 1202 unsigned long watermark; 1203 1204 if (cc->contended || fatal_signal_pending(current)) 1205 return COMPACT_CONTENDED; 1206 1207 /* Compaction run completes if the migrate and free scanner meet */ 1208 if (compact_scanners_met(cc)) { 1209 /* Let the next compaction start anew. */ 1210 reset_cached_positions(zone); 1211 1212 /* 1213 * Mark that the PG_migrate_skip information should be cleared 1214 * by kswapd when it goes to sleep. kswapd does not set the 1215 * flag itself as the decision to be clear should be directly 1216 * based on an allocation request. 1217 */ 1218 if (!current_is_kswapd()) 1219 zone->compact_blockskip_flush = true; 1220 1221 return COMPACT_COMPLETE; 1222 } 1223 1224 if (is_via_compact_memory(cc->order)) 1225 return COMPACT_CONTINUE; 1226 1227 /* Compaction run is not finished if the watermark is not met */ 1228 watermark = low_wmark_pages(zone); 1229 1230 if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, 1231 cc->alloc_flags)) 1232 return COMPACT_CONTINUE; 1233 1234 /* Direct compactor: Is a suitable page free? 
*/ 1235 for (order = cc->order; order < MAX_ORDER; order++) { 1236 struct free_area *area = &zone->free_area[order]; 1237 bool can_steal; 1238 1239 /* Job done if page is free of the right migratetype */ 1240 if (!list_empty(&area->free_list[migratetype])) 1241 return COMPACT_PARTIAL; 1242 1243 #ifdef CONFIG_CMA 1244 /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ 1245 if (migratetype == MIGRATE_MOVABLE && 1246 !list_empty(&area->free_list[MIGRATE_CMA])) 1247 return COMPACT_PARTIAL; 1248 #endif 1249 /* 1250 * Job done if allocation would steal freepages from 1251 * other migratetype buddy lists. 1252 */ 1253 if (find_suitable_fallback(area, order, migratetype, 1254 true, &can_steal) != -1) 1255 return COMPACT_PARTIAL; 1256 } 1257 1258 return COMPACT_NO_SUITABLE_PAGE; 1259 } 1260 1261 static int compact_finished(struct zone *zone, struct compact_control *cc, 1262 const int migratetype) 1263 { 1264 int ret; 1265 1266 ret = __compact_finished(zone, cc, migratetype); 1267 trace_mm_compaction_finished(zone, cc->order, ret); 1268 if (ret == COMPACT_NO_SUITABLE_PAGE) 1269 ret = COMPACT_CONTINUE; 1270 1271 return ret; 1272 } 1273 1274 /* 1275 * compaction_suitable: Is this suitable to run compaction on this zone now? 1276 * Returns 1277 * COMPACT_SKIPPED - If there are too few free pages for compaction 1278 * COMPACT_PARTIAL - If the allocation would succeed without compaction 1279 * COMPACT_CONTINUE - If compaction should run now 1280 */ 1281 static unsigned long __compaction_suitable(struct zone *zone, int order, 1282 int alloc_flags, int classzone_idx) 1283 { 1284 int fragindex; 1285 unsigned long watermark; 1286 1287 if (is_via_compact_memory(order)) 1288 return COMPACT_CONTINUE; 1289 1290 watermark = low_wmark_pages(zone); 1291 /* 1292 * If watermarks for high-order allocation are already met, there 1293 * should be no need for compaction at all. 
1294 */ 1295 if (zone_watermark_ok(zone, order, watermark, classzone_idx, 1296 alloc_flags)) 1297 return COMPACT_PARTIAL; 1298 1299 /* 1300 * Watermarks for order-0 must be met for compaction. Note the 2UL. 1301 * This is because during migration, copies of pages need to be 1302 * allocated and for a short time, the footprint is higher 1303 */ 1304 watermark += (2UL << order); 1305 if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) 1306 return COMPACT_SKIPPED; 1307 1308 /* 1309 * fragmentation index determines if allocation failures are due to 1310 * low memory or external fragmentation 1311 * 1312 * index of -1000 would imply allocations might succeed depending on 1313 * watermarks, but we already failed the high-order watermark check 1314 * index towards 0 implies failure is due to lack of memory 1315 * index towards 1000 implies failure is due to fragmentation 1316 * 1317 * Only compact if a failure would be due to fragmentation. 1318 */ 1319 fragindex = fragmentation_index(zone, order); 1320 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 1321 return COMPACT_NOT_SUITABLE_ZONE; 1322 1323 return COMPACT_CONTINUE; 1324 } 1325 1326 unsigned long compaction_suitable(struct zone *zone, int order, 1327 int alloc_flags, int classzone_idx) 1328 { 1329 unsigned long ret; 1330 1331 ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); 1332 trace_mm_compaction_suitable(zone, order, ret); 1333 if (ret == COMPACT_NOT_SUITABLE_ZONE) 1334 ret = COMPACT_SKIPPED; 1335 1336 return ret; 1337 } 1338 1339 static int compact_zone(struct zone *zone, struct compact_control *cc) 1340 { 1341 int ret; 1342 unsigned long start_pfn = zone->zone_start_pfn; 1343 unsigned long end_pfn = zone_end_pfn(zone); 1344 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); 1345 const bool sync = cc->mode != MIGRATE_ASYNC; 1346 1347 ret = compaction_suitable(zone, cc->order, cc->alloc_flags, 1348 cc->classzone_idx); 1349 switch (ret) { 1350 case 
COMPACT_PARTIAL: 1351 case COMPACT_SKIPPED: 1352 /* Compaction is likely to fail */ 1353 return ret; 1354 case COMPACT_CONTINUE: 1355 /* Fall through to compaction */ 1356 ; 1357 } 1358 1359 /* 1360 * Clear pageblock skip if there were failures recently and compaction 1361 * is about to be retried after being deferred. kswapd does not do 1362 * this reset as it'll reset the cached information when going to sleep. 1363 */ 1364 if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) 1365 __reset_isolation_suitable(zone); 1366 1367 /* 1368 * Setup to move all movable pages to the end of the zone. Used cached 1369 * information on where the scanners should start but check that it 1370 * is initialised by ensuring the values are within zone boundaries. 1371 */ 1372 cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; 1373 cc->free_pfn = zone->compact_cached_free_pfn; 1374 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { 1375 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); 1376 zone->compact_cached_free_pfn = cc->free_pfn; 1377 } 1378 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { 1379 cc->migrate_pfn = start_pfn; 1380 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; 1381 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 1382 } 1383 cc->last_migrated_pfn = 0; 1384 1385 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, 1386 cc->free_pfn, end_pfn, sync); 1387 1388 migrate_prep_local(); 1389 1390 while ((ret = compact_finished(zone, cc, migratetype)) == 1391 COMPACT_CONTINUE) { 1392 int err; 1393 1394 switch (isolate_migratepages(zone, cc)) { 1395 case ISOLATE_ABORT: 1396 ret = COMPACT_CONTENDED; 1397 putback_movable_pages(&cc->migratepages); 1398 cc->nr_migratepages = 0; 1399 goto out; 1400 case ISOLATE_NONE: 1401 /* 1402 * We haven't isolated and migrated anything, but 1403 * there might still be unflushed migrations from 1404 * previous cc->order aligned block. 
1405 */ 1406 goto check_drain; 1407 case ISOLATE_SUCCESS: 1408 ; 1409 } 1410 1411 err = migrate_pages(&cc->migratepages, compaction_alloc, 1412 compaction_free, (unsigned long)cc, cc->mode, 1413 MR_COMPACTION); 1414 1415 trace_mm_compaction_migratepages(cc->nr_migratepages, err, 1416 &cc->migratepages); 1417 1418 /* All pages were either migrated or will be released */ 1419 cc->nr_migratepages = 0; 1420 if (err) { 1421 putback_movable_pages(&cc->migratepages); 1422 /* 1423 * migrate_pages() may return -ENOMEM when scanners meet 1424 * and we want compact_finished() to detect it 1425 */ 1426 if (err == -ENOMEM && !compact_scanners_met(cc)) { 1427 ret = COMPACT_CONTENDED; 1428 goto out; 1429 } 1430 } 1431 1432 check_drain: 1433 /* 1434 * Has the migration scanner moved away from the previous 1435 * cc->order aligned block where we migrated from? If yes, 1436 * flush the pages that were freed, so that they can merge and 1437 * compact_finished() can detect immediately if allocation 1438 * would succeed. 1439 */ 1440 if (cc->order > 0 && cc->last_migrated_pfn) { 1441 int cpu; 1442 unsigned long current_block_start = 1443 cc->migrate_pfn & ~((1UL << cc->order) - 1); 1444 1445 if (cc->last_migrated_pfn < current_block_start) { 1446 cpu = get_cpu(); 1447 lru_add_drain_cpu(cpu); 1448 drain_local_pages(zone); 1449 put_cpu(); 1450 /* No more flushing until we migrate again */ 1451 cc->last_migrated_pfn = 0; 1452 } 1453 } 1454 1455 } 1456 1457 out: 1458 /* 1459 * Release free pages and update where the free scanner should restart, 1460 * so we don't leave any returned pages behind in the next attempt. 1461 */ 1462 if (cc->nr_freepages > 0) { 1463 unsigned long free_pfn = release_freepages(&cc->freepages); 1464 1465 cc->nr_freepages = 0; 1466 VM_BUG_ON(free_pfn == 0); 1467 /* The cached pfn is always the first in a pageblock */ 1468 free_pfn &= ~(pageblock_nr_pages-1); 1469 /* 1470 * Only go back, not forward. 
The cached pfn might have been 1471 * already reset to zone end in compact_finished() 1472 */ 1473 if (free_pfn > zone->compact_cached_free_pfn) 1474 zone->compact_cached_free_pfn = free_pfn; 1475 } 1476 1477 trace_mm_compaction_end(start_pfn, cc->migrate_pfn, 1478 cc->free_pfn, end_pfn, sync, ret); 1479 1480 if (ret == COMPACT_CONTENDED) 1481 ret = COMPACT_PARTIAL; 1482 1483 return ret; 1484 } 1485 1486 static unsigned long compact_zone_order(struct zone *zone, int order, 1487 gfp_t gfp_mask, enum migrate_mode mode, int *contended, 1488 int alloc_flags, int classzone_idx) 1489 { 1490 unsigned long ret; 1491 struct compact_control cc = { 1492 .nr_freepages = 0, 1493 .nr_migratepages = 0, 1494 .order = order, 1495 .gfp_mask = gfp_mask, 1496 .zone = zone, 1497 .mode = mode, 1498 .alloc_flags = alloc_flags, 1499 .classzone_idx = classzone_idx, 1500 }; 1501 INIT_LIST_HEAD(&cc.freepages); 1502 INIT_LIST_HEAD(&cc.migratepages); 1503 1504 ret = compact_zone(zone, &cc); 1505 1506 VM_BUG_ON(!list_empty(&cc.freepages)); 1507 VM_BUG_ON(!list_empty(&cc.migratepages)); 1508 1509 *contended = cc.contended; 1510 return ret; 1511 } 1512 1513 int sysctl_extfrag_threshold = 500; 1514 1515 /** 1516 * try_to_compact_pages - Direct compact to satisfy a high-order allocation 1517 * @gfp_mask: The GFP mask of the current allocation 1518 * @order: The order of the current allocation 1519 * @alloc_flags: The allocation flags of the current allocation 1520 * @ac: The context of current allocation 1521 * @mode: The migration mode for async, sync light, or sync migration 1522 * @contended: Return value that determines if compaction was aborted due to 1523 * need_resched() or lock contention 1524 * 1525 * This is the main entry point for direct page compaction. 
1526 */ 1527 unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, 1528 int alloc_flags, const struct alloc_context *ac, 1529 enum migrate_mode mode, int *contended) 1530 { 1531 int may_enter_fs = gfp_mask & __GFP_FS; 1532 int may_perform_io = gfp_mask & __GFP_IO; 1533 struct zoneref *z; 1534 struct zone *zone; 1535 int rc = COMPACT_DEFERRED; 1536 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ 1537 1538 *contended = COMPACT_CONTENDED_NONE; 1539 1540 /* Check if the GFP flags allow compaction */ 1541 if (!order || !may_enter_fs || !may_perform_io) 1542 return COMPACT_SKIPPED; 1543 1544 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); 1545 1546 /* Compact each zone in the list */ 1547 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 1548 ac->nodemask) { 1549 int status; 1550 int zone_contended; 1551 1552 if (compaction_deferred(zone, order)) 1553 continue; 1554 1555 status = compact_zone_order(zone, order, gfp_mask, mode, 1556 &zone_contended, alloc_flags, 1557 ac->classzone_idx); 1558 rc = max(status, rc); 1559 /* 1560 * It takes at least one zone that wasn't lock contended 1561 * to clear all_zones_contended. 1562 */ 1563 all_zones_contended &= zone_contended; 1564 1565 /* If a normal allocation would succeed, stop compacting */ 1566 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 1567 ac->classzone_idx, alloc_flags)) { 1568 /* 1569 * We think the allocation will succeed in this zone, 1570 * but it is not certain, hence the false. The caller 1571 * will repeat this with true if allocation indeed 1572 * succeeds in this zone. 1573 */ 1574 compaction_defer_reset(zone, order, false); 1575 /* 1576 * It is possible that async compaction aborted due to 1577 * need_resched() and the watermarks were ok thanks to 1578 * somebody else freeing memory. 
The allocation can 1579 * however still fail so we better signal the 1580 * need_resched() contention anyway (this will not 1581 * prevent the allocation attempt). 1582 */ 1583 if (zone_contended == COMPACT_CONTENDED_SCHED) 1584 *contended = COMPACT_CONTENDED_SCHED; 1585 1586 goto break_loop; 1587 } 1588 1589 if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { 1590 /* 1591 * We think that allocation won't succeed in this zone 1592 * so we defer compaction there. If it ends up 1593 * succeeding after all, it will be reset. 1594 */ 1595 defer_compaction(zone, order); 1596 } 1597 1598 /* 1599 * We might have stopped compacting due to need_resched() in 1600 * async compaction, or due to a fatal signal detected. In that 1601 * case do not try further zones and signal need_resched() 1602 * contention. 1603 */ 1604 if ((zone_contended == COMPACT_CONTENDED_SCHED) 1605 || fatal_signal_pending(current)) { 1606 *contended = COMPACT_CONTENDED_SCHED; 1607 goto break_loop; 1608 } 1609 1610 continue; 1611 break_loop: 1612 /* 1613 * We might not have tried all the zones, so be conservative 1614 * and assume they are not all lock contended. 1615 */ 1616 all_zones_contended = 0; 1617 break; 1618 } 1619 1620 /* 1621 * If at least one zone wasn't deferred or skipped, we report if all 1622 * zones that were tried were lock contended. 
1623 */ 1624 if (rc > COMPACT_SKIPPED && all_zones_contended) 1625 *contended = COMPACT_CONTENDED_LOCK; 1626 1627 return rc; 1628 } 1629 1630 1631 /* Compact all zones within a node */ 1632 static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) 1633 { 1634 int zoneid; 1635 struct zone *zone; 1636 1637 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 1638 1639 zone = &pgdat->node_zones[zoneid]; 1640 if (!populated_zone(zone)) 1641 continue; 1642 1643 cc->nr_freepages = 0; 1644 cc->nr_migratepages = 0; 1645 cc->zone = zone; 1646 INIT_LIST_HEAD(&cc->freepages); 1647 INIT_LIST_HEAD(&cc->migratepages); 1648 1649 /* 1650 * When called via /proc/sys/vm/compact_memory 1651 * this makes sure we compact the whole zone regardless of 1652 * cached scanner positions. 1653 */ 1654 if (is_via_compact_memory(cc->order)) 1655 __reset_isolation_suitable(zone); 1656 1657 if (is_via_compact_memory(cc->order) || 1658 !compaction_deferred(zone, cc->order)) 1659 compact_zone(zone, cc); 1660 1661 VM_BUG_ON(!list_empty(&cc->freepages)); 1662 VM_BUG_ON(!list_empty(&cc->migratepages)); 1663 1664 if (is_via_compact_memory(cc->order)) 1665 continue; 1666 1667 if (zone_watermark_ok(zone, cc->order, 1668 low_wmark_pages(zone), 0, 0)) 1669 compaction_defer_reset(zone, cc->order, false); 1670 } 1671 } 1672 1673 void compact_pgdat(pg_data_t *pgdat, int order) 1674 { 1675 struct compact_control cc = { 1676 .order = order, 1677 .mode = MIGRATE_ASYNC, 1678 }; 1679 1680 if (!order) 1681 return; 1682 1683 __compact_pgdat(pgdat, &cc); 1684 } 1685 1686 static void compact_node(int nid) 1687 { 1688 struct compact_control cc = { 1689 .order = -1, 1690 .mode = MIGRATE_SYNC, 1691 .ignore_skip_hint = true, 1692 }; 1693 1694 __compact_pgdat(NODE_DATA(nid), &cc); 1695 } 1696 1697 /* Compact all nodes in the system */ 1698 static void compact_nodes(void) 1699 { 1700 int nid; 1701 1702 /* Flush pending updates to the LRU lists */ 1703 lru_add_drain_all(); 1704 1705 for_each_online_node(nid) 
1706 compact_node(nid); 1707 } 1708 1709 /* The written value is actually unused, all memory is compacted */ 1710 int sysctl_compact_memory; 1711 1712 /* 1713 * This is the entry point for compacting all nodes via 1714 * /proc/sys/vm/compact_memory 1715 */ 1716 int sysctl_compaction_handler(struct ctl_table *table, int write, 1717 void __user *buffer, size_t *length, loff_t *ppos) 1718 { 1719 if (write) 1720 compact_nodes(); 1721 1722 return 0; 1723 } 1724 1725 int sysctl_extfrag_handler(struct ctl_table *table, int write, 1726 void __user *buffer, size_t *length, loff_t *ppos) 1727 { 1728 proc_dointvec_minmax(table, write, buffer, length, ppos); 1729 1730 return 0; 1731 } 1732 1733 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) 1734 static ssize_t sysfs_compact_node(struct device *dev, 1735 struct device_attribute *attr, 1736 const char *buf, size_t count) 1737 { 1738 int nid = dev->id; 1739 1740 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { 1741 /* Flush pending updates to the LRU lists */ 1742 lru_add_drain_all(); 1743 1744 compact_node(nid); 1745 } 1746 1747 return count; 1748 } 1749 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); 1750 1751 int compaction_register_node(struct node *node) 1752 { 1753 return device_create_file(&node->dev, &dev_attr_compact); 1754 } 1755 1756 void compaction_unregister_node(struct node *node) 1757 { 1758 return device_remove_file(&node->dev, &dev_attr_compact); 1759 } 1760 #endif /* CONFIG_SYSFS && CONFIG_NUMA */ 1761 1762 #endif /* CONFIG_COMPACTION */ 1763