/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting.
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
        count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
        count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
#ifdef CONFIG_TRACEPOINTS
static const char *const compaction_status_string[] = {
        "deferred",
        "skipped",
        "continue",
        "partial",
        "complete",
        "no_suitable_page",
        "not_suitable_zone",
};
#endif

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

static unsigned long release_freepages(struct list_head *freelist)
{
        struct page *page, *next;
        unsigned long high_pfn = 0;

        list_for_each_entry_safe(page, next, freelist, lru) {
                unsigned long pfn = page_to_pfn(page);
                list_del(&page->lru);
                __free_page(page);
                if (pfn > high_pfn)
                        high_pfn = pfn;
        }

        return high_pfn;
}

static void map_pages(struct list_head *list)
{
        struct page *page;

        list_for_each_entry(page, list, lru) {
                arch_alloc_page(page, 0);
                kernel_map_pages(page, 1, 1);
                kasan_alloc_pages(page, 0);
        }
}

static inline bool migrate_async_suitable(int migratetype)
{
        return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}

/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration or free compaction scanner. The scanners then need to
 * use only pfn_valid_within() check for arches that allow holes within
 * pageblocks.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 */
static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone)
{
        struct page *start_page;
        struct page *end_page;

        /* end_pfn is one past the range we are checking */
        end_pfn--;

        if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
                return NULL;

        start_page = pfn_to_page(start_pfn);

        if (page_zone(start_page) != zone)
                return NULL;

        end_page = pfn_to_page(end_pfn);

        /* This gives a shorter code than deriving page_zone(end_page) */
        if (page_zone_id(start_page) != page_zone_id(end_page))
                return NULL;

        return start_page;
}

#ifdef CONFIG_COMPACTION

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. 1 << compact_defer_shift compactions are skipped up
 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 */
void defer_compaction(struct zone *zone, int order)
{
        zone->compact_considered = 0;
        zone->compact_defer_shift++;

        if (order < zone->compact_order_failed)
                zone->compact_order_failed = order;

        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

        trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
bool compaction_deferred(struct zone *zone, int order)
{
        unsigned long defer_limit = 1UL << zone->compact_defer_shift;

        if (order < zone->compact_order_failed)
                return false;

        /* Avoid possible overflow */
        if (++zone->compact_considered > defer_limit)
                zone->compact_considered = defer_limit;

        if (zone->compact_considered >= defer_limit)
                return false;

        trace_mm_compaction_deferred(zone, order);

        return true;
}

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order,
                bool alloc_success)
{
        if (alloc_success) {
                zone->compact_considered = 0;
                zone->compact_defer_shift = 0;
        }
        if (order >= zone->compact_order_failed)
                zone->compact_order_failed = order + 1;

        trace_mm_compaction_defer_reset(zone, order);
}

/* Returns true if restarting compaction after many failures */
bool compaction_restarting(struct zone *zone, int order)
{
        if (order < zone->compact_order_failed)
                return false;

        return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
                zone->compact_considered >= 1UL << zone->compact_defer_shift;
}

/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
                                        struct page *page)
{
        if (cc->ignore_skip_hint)
                return true;

        return !get_pageblock_skip(page);
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
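 * Besides clearing the per-pageblock skip bits, it also resets the cached
 * migrate and free scanner positions back to the zone boundaries, so the
 * next compaction pass scans the whole zone again.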
 */
static void __reset_isolation_suitable(struct zone *zone)
{
        unsigned long start_pfn = zone->zone_start_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long pfn;

        zone->compact_cached_migrate_pfn[0] = start_pfn;
        zone->compact_cached_migrate_pfn[1] = start_pfn;
        zone->compact_cached_free_pfn = end_pfn;
        zone->compact_blockskip_flush = false;

        /* Walk the zone and mark every pageblock as suitable for isolation */
        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                struct page *page;

                cond_resched();

                if (!pfn_valid(pfn))
                        continue;

                page = pfn_to_page(pfn);
                if (zone != page_zone(page))
                        continue;

                clear_pageblock_skip(page);
        }
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
        int zoneid;

        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
                struct zone *zone = &pgdat->node_zones[zoneid];
                if (!populated_zone(zone))
                        continue;

                /* Only flush if a full compaction finished recently */
                if (zone->compact_blockskip_flush)
                        __reset_isolation_suitable(zone);
        }
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
                        struct page *page, unsigned long nr_isolated,
                        bool migrate_scanner)
{
        struct zone *zone = cc->zone;
        unsigned long pfn;

        if (cc->ignore_skip_hint)
                return;

        if (!page)
                return;

        if (nr_isolated)
                return;

        set_pageblock_skip(page);

        pfn = page_to_pfn(page);

        /* Update where async and sync compaction should restart */
        if (migrate_scanner) {
                if (pfn > zone->compact_cached_migrate_pfn[0])
                        zone->compact_cached_migrate_pfn[0] = pfn;
                if (cc->mode != MIGRATE_ASYNC &&
                    pfn > zone->compact_cached_migrate_pfn[1])
                        zone->compact_cached_migrate_pfn[1] = pfn;
        } else {
                if (pfn < zone->compact_cached_free_pfn)
                        zone->compact_cached_free_pfn = pfn;
        }
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
                                        struct page *page)
{
        return true;
}

static void update_pageblock_skip(struct compact_control *cc,
                        struct page *page, unsigned long nr_isolated,
                        bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, back out if the lock cannot
 * be taken immediately. For sync compaction, spin on the lock if needed.
 *
 * Returns true if the lock is held
 * Returns false if the lock is not held and compaction should abort
 */
static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
                                                struct compact_control *cc)
{
        if (cc->mode == MIGRATE_ASYNC) {
                if (!spin_trylock_irqsave(lock, *flags)) {
                        cc->contended = COMPACT_CONTENDED_LOCK;
                        return false;
                }
        } else {
                spin_lock_irqsave(lock, *flags);
        }

        return true;
}

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true.
 * If scheduling is needed, async compaction aborts. Sync compaction
 * schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to fatal signal pending, or
 *              async compaction due to need_resched()
 * Returns false when compaction can continue (sync compaction might have
 *              scheduled)
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
                unsigned long flags, bool *locked, struct compact_control *cc)
{
        if (*locked) {
                spin_unlock_irqrestore(lock, flags);
                *locked = false;
        }

        if (fatal_signal_pending(current)) {
                cc->contended = COMPACT_CONTENDED_SCHED;
                return true;
        }

        if (need_resched()) {
                if (cc->mode == MIGRATE_ASYNC) {
                        cc->contended = COMPACT_CONTENDED_SCHED;
                        return true;
                }
                cond_resched();
        }

        return false;
}

/*
 * Aside from avoiding lock contention, compaction also periodically checks
 * need_resched() and either schedules in sync compaction or aborts async
 * compaction. This is similar to what compact_unlock_should_abort() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
 * Returns true when async compaction should abort.
 */
static inline bool compact_should_abort(struct compact_control *cc)
{
        /* async compaction aborts if contended */
        if (need_resched()) {
                if (cc->mode == MIGRATE_ASYNC) {
                        cc->contended = COMPACT_CONTENDED_SCHED;
                        return true;
                }

                cond_resched();
        }

        return false;
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
        /* If the page is a large free page, then disallow migration */
        if (PageBuddy(page)) {
                /*
                 * We are checking page_order without zone->lock taken. But
                 * the only small danger is that we skip a potentially suitable
                 * pageblock, so it's not worth to check order for valid range.
                 */
                if (page_order_unsafe(page) >= pageblock_order)
                        return false;
        }

        /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
        if (migrate_async_suitable(get_pageblock_migratetype(page)))
                return true;

        /* Otherwise skip the block */
        return false;
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
                                unsigned long *start_pfn,
                                unsigned long end_pfn,
                                struct list_head *freelist,
                                bool strict)
{
        int nr_scanned = 0, total_isolated = 0;
        struct page *cursor, *valid_page = NULL;
        unsigned long flags = 0;
        bool locked = false;
        unsigned long blockpfn = *start_pfn;

        cursor = pfn_to_page(blockpfn);

        /* Isolate free pages. */
        for (; blockpfn < end_pfn; blockpfn++, cursor++) {
                int isolated, i;
                struct page *page = cursor;

                /*
                 * Periodically drop the lock (if held) regardless of its
                 * contention, to give chance to IRQs.
                 * Abort if fatal signal pending or async compaction
                 * detects need_resched().
                 */
                if (!(blockpfn % SWAP_CLUSTER_MAX)
                    && compact_unlock_should_abort(&cc->zone->lock, flags,
                                                                &locked, cc))
                        break;

                nr_scanned++;
                if (!pfn_valid_within(blockpfn))
                        goto isolate_fail;

                if (!valid_page)
                        valid_page = page;
                if (!PageBuddy(page))
                        goto isolate_fail;

                /*
                 * If we already hold the lock, we can skip some rechecking.
                 * Note that if we hold the lock now, checked_pageblock was
                 * already set in some previous iteration (or strict is true),
                 * so it is correct to skip the suitable migration target
                 * recheck as well.
                 */
                if (!locked) {
                        /*
                         * The zone lock must be held to isolate freepages.
                         * Unfortunately this is a very coarse lock and can be
                         * heavily contended if there are parallel allocations
                         * or parallel compactions. For async compaction do not
                         * spin on the lock and we acquire the lock as late as
                         * possible.
                         */
                        locked = compact_trylock_irqsave(&cc->zone->lock,
                                                                &flags, cc);
                        if (!locked)
                                break;

                        /* Recheck this is a buddy page under lock */
                        if (!PageBuddy(page))
                                goto isolate_fail;
                }

                /* Found a free page, break it into order-0 pages */
                isolated = split_free_page(page);
                total_isolated += isolated;
                for (i = 0; i < isolated; i++) {
                        list_add(&page->lru, freelist);
                        page++;
                }

                /* If a page was split, advance to the end of it */
                if (isolated) {
                        cc->nr_freepages += isolated;
                        if (!strict &&
                                cc->nr_migratepages <= cc->nr_freepages) {
                                blockpfn += isolated;
                                break;
                        }

                        blockpfn += isolated - 1;
                        cursor += isolated - 1;
                        continue;
                }

isolate_fail:
                if (strict)
                        break;
                else
                        continue;

        }

        trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
                                        nr_scanned, total_isolated);

        /* Record how far we have got within the block */
        *start_pfn = blockpfn;

        /*
         * If strict isolation is requested by CMA then check that all the
         * pages requested were isolated. If there were any failures, 0 is
         * returned and CMA will fail.
         */
        if (strict && blockpfn < end_pfn)
                total_isolated = 0;

        if (locked)
                spin_unlock_irqrestore(&cc->zone->lock, flags);

        /* Update the pageblock-skip if the whole pageblock was scanned */
        if (blockpfn == end_pfn)
                update_pageblock_skip(cc, valid_page, total_isolated, false);

        count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
        if (total_isolated)
                count_compact_events(COMPACTISOLATED, total_isolated);
        return total_isolated;
}

/**
 * isolate_freepages_range() - isolate free pages.
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, cause function to
 * undo its actions and return zero.
 *
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of
 * a free page).
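 *
 * Every page in the range must be isolated for the call to succeed; this is
 * the strict mode used when a caller such as CMA needs a physically
 * contiguous range.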
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long isolated, pfn, block_end_pfn;
        LIST_HEAD(freelist);

        pfn = start_pfn;
        block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

        for (; pfn < end_pfn; pfn += isolated,
                                block_end_pfn += pageblock_nr_pages) {
                /* Protect pfn from changing by isolate_freepages_block */
                unsigned long isolate_start_pfn = pfn;

                block_end_pfn = min(block_end_pfn, end_pfn);

                /*
                 * pfn could pass the block_end_pfn if isolated freepage
                 * is more than pageblock order. In this case, we adjust
                 * scanning range to right one.
                 */
                if (pfn >= block_end_pfn) {
                        block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
                        block_end_pfn = min(block_end_pfn, end_pfn);
                }

                if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
                        break;

                isolated = isolate_freepages_block(cc, &isolate_start_pfn,
                                        block_end_pfn, &freelist, true);

                /*
                 * In strict mode, isolate_freepages_block() returns 0 if
                 * there are any holes in the block (ie. invalid PFNs or
                 * non-free pages).
                 */
                if (!isolated)
                        break;

                /*
                 * If we managed to isolate pages, it is always (1 << n) *
                 * pageblock_nr_pages for some non-negative n. (Max order
                 * page may span two pageblocks).
                 */
        }

        /* split_free_page does not map the pages */
        map_pages(&freelist);

        if (pfn < end_pfn) {
                /* Loop terminated early, cleanup. */
                release_freepages(&freelist);
                return 0;
        }

        /* We don't use freelists for anything. */
        return pfn;
}

/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
        struct page *page;
        unsigned int count[2] = { 0, };

        if (list_empty(&cc->migratepages))
                return;

        list_for_each_entry(page, &cc->migratepages, lru)
                count[!!page_is_file_cache(page)]++;

        mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
        mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
        unsigned long active, inactive, isolated;

        inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
                        zone_page_state(zone, NR_INACTIVE_ANON);
        active = zone_page_state(zone, NR_ACTIVE_FILE) +
                        zone_page_state(zone, NR_ACTIVE_ANON);
        isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
                        zone_page_state(zone, NR_ISOLATED_ANON);

        return isolated > (inactive + active) / 2;
}

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *                                a single pageblock
 * @cc:         Compaction control structure.
 * @low_pfn:    The first PFN to isolate
 * @end_pfn:    The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be less than, equal to, or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly.
 * The cc->migrate_pfn field is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        unsigned long end_pfn, isolate_mode_t isolate_mode)
{
        struct zone *zone = cc->zone;
        unsigned long nr_scanned = 0, nr_isolated = 0;
        struct list_head *migratelist = &cc->migratepages;
        struct lruvec *lruvec;
        unsigned long flags = 0;
        bool locked = false;
        struct page *page = NULL, *valid_page = NULL;
        unsigned long start_pfn = low_pfn;

        /*
         * Ensure that there are not too many pages isolated from the LRU
         * list by either parallel reclaimers or compaction. If there are,
         * delay for some time until fewer pages are isolated
         */
        while (unlikely(too_many_isolated(zone))) {
                /* async migration should just abort */
                if (cc->mode == MIGRATE_ASYNC)
                        return 0;

                congestion_wait(BLK_RW_ASYNC, HZ/10);

                if (fatal_signal_pending(current))
                        return 0;
        }

        if (compact_should_abort(cc))
                return 0;

        /* Time to isolate some pages for migration */
        for (; low_pfn < end_pfn; low_pfn++) {
                /*
                 * Periodically drop the lock (if held) regardless of its
                 * contention, to give chance to IRQs. Abort async compaction
                 * if contended.
                 */
                if (!(low_pfn % SWAP_CLUSTER_MAX)
                    && compact_unlock_should_abort(&zone->lru_lock, flags,
                                                                &locked, cc))
                        break;

                if (!pfn_valid_within(low_pfn))
                        continue;
                nr_scanned++;

                page = pfn_to_page(low_pfn);

                if (!valid_page)
                        valid_page = page;

                /*
                 * Skip if free. We read page order here without zone lock
                 * which is generally unsafe, but the race window is small and
                 * the worst thing that can happen is that we skip some
                 * potential isolation targets.
                 */
                if (PageBuddy(page)) {
                        unsigned long freepage_order = page_order_unsafe(page);

                        /*
                         * Without lock, we cannot be sure that what we got is
                         * a valid page order. Consider only values in the
                         * valid order range to prevent low_pfn overflow.
                         */
                        if (freepage_order > 0 && freepage_order < MAX_ORDER)
                                low_pfn += (1UL << freepage_order) - 1;
                        continue;
                }

                /*
                 * Check may be lockless but that's ok as we recheck later.
                 * It's possible to migrate LRU pages and balloon pages.
                 * Skip any other type of page.
                 */
                if (!PageLRU(page)) {
                        if (unlikely(balloon_page_movable(page))) {
                                if (balloon_page_isolate(page)) {
                                        /* Successfully isolated */
                                        goto isolate_success;
                                }
                        }
                        continue;
                }

                /*
                 * PageLRU is set. lru_lock normally excludes isolation
                 * splitting and collapsing (collapsing has already happened
                 * if PageLRU is set) but the lock is not necessarily taken
                 * here and it is wasteful to take it just to check transhuge.
                 * Check TransHuge without lock and skip the whole pageblock if
                 * it's either a transhuge or hugetlbfs page, as calling
                 * compound_order() without preventing THP from splitting the
                 * page underneath us may return surprising results.
                 */
                if (PageTransHuge(page)) {
                        if (!locked)
                                low_pfn = ALIGN(low_pfn + 1,
                                                pageblock_nr_pages) - 1;
                        else
                                low_pfn += (1 << compound_order(page)) - 1;

                        continue;
                }

                /*
                 * Migration will fail if an anonymous page is pinned in memory,
                 * so avoid taking lru_lock and isolating it unnecessarily in an
                 * admittedly racy check.
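                 * For an anonymous page with no mapping, a page_count()
                 * above page_mapcount() means somebody else holds an extra
                 * reference (e.g. a get_user_pages() pin), so migration
                 * would only be wasted effort.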
                 */
                if (!page_mapping(page) &&
                    page_count(page) > page_mapcount(page))
                        continue;

                /* If we already hold the lock, we can skip some rechecking */
                if (!locked) {
                        locked = compact_trylock_irqsave(&zone->lru_lock,
                                                                &flags, cc);
                        if (!locked)
                                break;

                        /* Recheck PageLRU and PageTransHuge under lock */
                        if (!PageLRU(page))
                                continue;
                        if (PageTransHuge(page)) {
                                low_pfn += (1 << compound_order(page)) - 1;
                                continue;
                        }
                }

                lruvec = mem_cgroup_page_lruvec(page, zone);

                /* Try isolate the page */
                if (__isolate_lru_page(page, isolate_mode) != 0)
                        continue;

                VM_BUG_ON_PAGE(PageTransCompound(page), page);

                /* Successfully isolated */
                del_page_from_lru_list(page, lruvec, page_lru(page));

isolate_success:
                list_add(&page->lru, migratelist);
                cc->nr_migratepages++;
                nr_isolated++;

                /* Avoid isolating too much */
                if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
                        ++low_pfn;
                        break;
                }
        }

        /*
         * The PageBuddy() check could have potentially brought us outside
         * the range to be scanned.
         */
        if (unlikely(low_pfn > end_pfn))
                low_pfn = end_pfn;

        if (locked)
                spin_unlock_irqrestore(&zone->lru_lock, flags);

        /*
         * Update the pageblock-skip information and cached scanner pfn,
         * if the whole pageblock was scanned without isolating any page.
         */
        if (low_pfn == end_pfn)
                update_pageblock_skip(cc, valid_page, nr_isolated, true);

        trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
                                                nr_scanned, nr_isolated);

        count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
        if (nr_isolated)
                count_compact_events(COMPACTISOLATED, nr_isolated);

        return low_pfn;
}

/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns zero if isolation fails fatally due to e.g. pending signal.
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of a THP page).
 */
unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
                                                        unsigned long end_pfn)
{
        unsigned long pfn, block_end_pfn;

        /* Scan block by block. First and last block may be incomplete */
        pfn = start_pfn;
        block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

        for (; pfn < end_pfn; pfn = block_end_pfn,
                                block_end_pfn += pageblock_nr_pages) {

                block_end_pfn = min(block_end_pfn, end_pfn);

                if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
                        continue;

                pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
                                                        ISOLATE_UNEVICTABLE);

                /*
                 * In case of fatal failure, release everything that might
                 * have been isolated in the previous iteration, and signal
                 * the failure back to caller.
                 */
                if (!pfn) {
                        putback_movable_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
                        break;
                }

                if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
                        break;
        }
        acct_isolated(cc->zone, cc);

        return pfn;
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
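 * The free scanner starts at cc->free_pfn near the end of the zone and walks
 * pageblocks backwards, towards the migration scanner, refilling
 * cc->freepages as migration consumes them.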
 */
static void isolate_freepages(struct compact_control *cc)
{
        struct zone *zone = cc->zone;
        struct page *page;
        unsigned long block_start_pfn;  /* start of current pageblock */
        unsigned long isolate_start_pfn; /* exact pfn we start at */
        unsigned long block_end_pfn;    /* end of current pageblock */
        unsigned long low_pfn;       /* lowest pfn scanner is able to scan */
        struct list_head *freelist = &cc->freepages;

        /*
         * Initialise the free scanner. The starting point is where we last
         * successfully isolated from, zone-cached value, or the end of the
         * zone when isolating for the first time. For looping we also need
         * this pfn aligned down to the pageblock boundary, because we do
         * block_start_pfn -= pageblock_nr_pages in the for loop.
         * For ending point, take care when isolating in last pageblock of a
         * zone which ends in the middle of a pageblock.
         * The low boundary is the end of the pageblock the migration scanner
         * is using.
         */
        isolate_start_pfn = cc->free_pfn;
        block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
        block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
                                                zone_end_pfn(zone));
        low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);

        /*
         * Isolate free pages until enough are available to migrate the
         * pages on cc->migratepages. We stop searching if the migrate
         * and free page scanners meet or enough free pages are isolated.
         */
        for (; block_start_pfn >= low_pfn &&
                        cc->nr_migratepages > cc->nr_freepages;
                                block_end_pfn = block_start_pfn,
                                block_start_pfn -= pageblock_nr_pages,
                                isolate_start_pfn = block_start_pfn) {

                /*
                 * This can iterate a massively long zone without finding any
                 * suitable migration targets, so periodically check if we need
                 * to schedule, or even abort async compaction.
                 */
                if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
                                                && compact_should_abort(cc))
                        break;

                page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
                                                                        zone);
                if (!page)
                        continue;

                /* Check the block is suitable for migration */
                if (!suitable_migration_target(page))
                        continue;

                /* If isolation recently failed, do not retry */
                if (!isolation_suitable(cc, page))
                        continue;

                /* Found a block suitable for isolating free pages from. */
                isolate_freepages_block(cc, &isolate_start_pfn,
                                        block_end_pfn, freelist, false);

                /*
                 * Remember where the free scanner should restart next time,
                 * which is where isolate_freepages_block() left off.
                 * But if it scanned the whole pageblock, isolate_start_pfn
                 * now points at block_end_pfn, which is the start of the next
                 * pageblock.
                 * In that case we will however want to restart at the start
                 * of the previous pageblock.
                 */
                cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
                        isolate_start_pfn :
                        block_start_pfn - pageblock_nr_pages;

                /*
                 * isolate_freepages_block() might have aborted due to async
                 * compaction being contended
                 */
                if (cc->contended)
                        break;
        }

        /* split_free_page does not map the pages */
        map_pages(freelist);

        /*
         * If we crossed the migrate scanner, we want to keep it that way
         * so that compact_finished() may detect this
         */
        if (block_start_pfn < low_pfn)
                cc->free_pfn = cc->migrate_pfn;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
                                        unsigned long data,
                                        int **result)
{
        struct compact_control *cc = (struct compact_control *)data;
        struct page *freepage;

        /*
         * Isolate free pages if necessary, and if we are not aborting due to
         * contention.
         */
        if (list_empty(&cc->freepages)) {
                if (!cc->contended)
                        isolate_freepages(cc);

                if (list_empty(&cc->freepages))
                        return NULL;
        }

        freepage = list_entry(cc->freepages.next, struct page, lru);
        list_del(&freepage->lru);
        cc->nr_freepages--;

        return freepage;
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
        struct compact_control *cc = (struct compact_control *)data;

        list_add(&page->lru, &cc->freepages);
        cc->nr_freepages++;
}

/* possible outcome of isolate_migratepages */
typedef enum {
        ISOLATE_ABORT,          /* Abort compaction now */
        ISOLATE_NONE,           /* No pages isolated, continue scanning */
        ISOLATE_SUCCESS,        /* Pages isolated, migrate */
} isolate_migrate_t;

/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
                                        struct compact_control *cc)
{
        unsigned long low_pfn, end_pfn;
        struct page *page;
        const isolate_mode_t isolate_mode =
                (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);

        /*
         * Start at where we last stopped, or beginning of the zone as
         * initialized by compact_zone()
         */
        low_pfn = cc->migrate_pfn;

        /* Only scan within a pageblock boundary */
        end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

        /*
         * Iterate over whole pageblocks until we find the first suitable.
         * Do not cross the free scanner.
         */
        for (; end_pfn <= cc->free_pfn;
                        low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {

                /*
                 * This can potentially iterate a massively long zone with
                 * many pageblocks unsuitable, so periodically check if we
                 * need to schedule, or even abort async compaction.
                 */
                if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
                                                && compact_should_abort(cc))
                        break;

                page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
                if (!page)
                        continue;

                /* If isolation recently failed, do not retry */
                if (!isolation_suitable(cc, page))
                        continue;

                /*
                 * For async compaction, also only scan in MOVABLE blocks.
                 * Async compaction is optimistic to see if the minimum amount
                 * of work satisfies the allocation.
                 */
                if (cc->mode == MIGRATE_ASYNC &&
                    !migrate_async_suitable(get_pageblock_migratetype(page)))
                        continue;

                /* Perform the isolation */
                low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
                                                                isolate_mode);

                if (!low_pfn || cc->contended) {
                        acct_isolated(zone, cc);
                        return ISOLATE_ABORT;
                }

                /*
                 * Either we isolated something and proceed with migration. Or
                 * we failed and compact_zone should decide if we should
                 * continue or not.
                 */
                break;
        }

        acct_isolated(zone, cc);
        /*
         * Record where migration scanner will be restarted. If we end up in
         * the same pageblock as the free scanner, make the scanners fully
         * meet so that compact_finished() terminates compaction.
         */
        cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;

        return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}

static int __compact_finished(struct zone *zone, struct compact_control *cc,
                            const int migratetype)
{
        unsigned int order;
        unsigned long watermark;

        if (cc->contended || fatal_signal_pending(current))
                return COMPACT_PARTIAL;

        /* Compaction run completes if the migrate and free scanner meet */
        if (cc->free_pfn <= cc->migrate_pfn) {
                /* Let the next compaction start anew. */
                zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
                zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
                zone->compact_cached_free_pfn = zone_end_pfn(zone);

                /*
                 * Mark that the PG_migrate_skip information should be cleared
                 * by kswapd when it goes to sleep. kswapd does not set the
                 * flag itself as the decision to be clear should be directly
                 * based on an allocation request.
                 */
                if (!current_is_kswapd())
                        zone->compact_blockskip_flush = true;

                return COMPACT_COMPLETE;
        }

        /*
         * order == -1 is expected when compacting via
         * /proc/sys/vm/compact_memory
         */
        if (cc->order == -1)
                return COMPACT_CONTINUE;

        /* Compaction run is not finished if the watermark is not met */
        watermark = low_wmark_pages(zone);

        if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
                                                        cc->alloc_flags))
                return COMPACT_CONTINUE;

        /* Direct compactor: Is a suitable page free? */
        for (order = cc->order; order < MAX_ORDER; order++) {
                struct free_area *area = &zone->free_area[order];

                /* Job done if page is free of the right migratetype */
                if (!list_empty(&area->free_list[migratetype]))
                        return COMPACT_PARTIAL;

                /* Job done if allocation would set block type */
                if (order >= pageblock_order && area->nr_free)
                        return COMPACT_PARTIAL;
        }

        return COMPACT_NO_SUITABLE_PAGE;
}

static int compact_finished(struct zone *zone, struct compact_control *cc,
                            const int migratetype)
{
        int ret;

        ret = __compact_finished(zone, cc, migratetype);
        trace_mm_compaction_finished(zone, cc->order, ret);
        if (ret == COMPACT_NO_SUITABLE_PAGE)
                ret = COMPACT_CONTINUE;

        return ret;
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
static unsigned long __compaction_suitable(struct zone *zone, int order,
                                        int alloc_flags, int classzone_idx)
{
        int fragindex;
        unsigned long watermark;

        /*
         * order == -1 is expected when compacting via
         * /proc/sys/vm/compact_memory
         */
        if (order == -1)
                return COMPACT_CONTINUE;

        watermark = low_wmark_pages(zone);
        /*
         * If watermarks for high-order allocation are already met, there
         * should be no need for compaction at all.
         */
        if (zone_watermark_ok(zone, order, watermark, classzone_idx,
                                                                alloc_flags))
                return COMPACT_PARTIAL;

        /*
         * Watermarks for order-0 must be met for compaction. Note the 2UL.
         * This is because during migration, copies of pages need to be
         * allocated and for a short time, the footprint is higher
         */
        watermark += (2UL << order);
        if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
                return COMPACT_SKIPPED;

        /*
         * fragmentation index determines if allocation failures are due to
         * low memory or external fragmentation
         *
         * index of -1000 would imply allocations might succeed depending on
         * watermarks, but we already failed the high-order watermark check
         * index towards 0 implies failure is due to lack of memory
         * index towards 1000 implies failure is due to fragmentation
         *
         * Only compact if a failure would be due to fragmentation.
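         * The default sysctl_extfrag_threshold is 500: indices at or below
         * the threshold are treated as a low-memory problem that compaction
         * cannot fix, and the zone is reported as not suitable.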
         */
        fragindex = fragmentation_index(zone, order);
        if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
                return COMPACT_NOT_SUITABLE_ZONE;

        return COMPACT_CONTINUE;
}

unsigned long compaction_suitable(struct zone *zone, int order,
                                        int alloc_flags, int classzone_idx)
{
        unsigned long ret;

        ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
        trace_mm_compaction_suitable(zone, order, ret);
        if (ret == COMPACT_NOT_SUITABLE_ZONE)
                ret = COMPACT_SKIPPED;

        return ret;
}

static int compact_zone(struct zone *zone, struct compact_control *cc)
{
        int ret;
        unsigned long start_pfn = zone->zone_start_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
        const bool sync = cc->mode != MIGRATE_ASYNC;
        unsigned long last_migrated_pfn = 0;

        ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                                                        cc->classzone_idx);
        switch (ret) {
        case COMPACT_PARTIAL:
        case COMPACT_SKIPPED:
                /* Compaction is likely to fail */
                return ret;
        case COMPACT_CONTINUE:
                /* Fall through to compaction */
                ;
        }

        /*
         * Clear pageblock skip if there were failures recently and compaction
         * is about to be retried after being deferred. kswapd does not do
         * this reset as it'll reset the cached information when going to sleep.
         */
        if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
                __reset_isolation_suitable(zone);

        /*
         * Setup to move all movable pages to the end of the zone. Use cached
         * information on where the scanners should start but check that it
         * is initialised by ensuring the values are within zone boundaries.
         */
        cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
        cc->free_pfn = zone->compact_cached_free_pfn;
        if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
                cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
                zone->compact_cached_free_pfn = cc->free_pfn;
        }
        if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
                cc->migrate_pfn = start_pfn;
                zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
                zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
        }

        trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync);

        migrate_prep_local();

        while ((ret = compact_finished(zone, cc, migratetype)) ==
                                                COMPACT_CONTINUE) {
                int err;
                unsigned long isolate_start_pfn = cc->migrate_pfn;

                switch (isolate_migratepages(zone, cc)) {
                case ISOLATE_ABORT:
                        ret = COMPACT_PARTIAL;
                        putback_movable_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
                        goto out;
                case ISOLATE_NONE:
                        /*
                         * We haven't isolated and migrated anything, but
                         * there might still be unflushed migrations from
                         * previous cc->order aligned block.
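                         * Those freed pages may still sit on per-cpu
                         * lists, so jump to the drain check below before
                         * compact_finished() looks at the buddy lists again.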
                         */
                        goto check_drain;
                case ISOLATE_SUCCESS:
                        ;
                }

                err = migrate_pages(&cc->migratepages, compaction_alloc,
                                compaction_free, (unsigned long)cc, cc->mode,
                                MR_COMPACTION);

                trace_mm_compaction_migratepages(cc->nr_migratepages, err,
                                                        &cc->migratepages);

                /* All pages were either migrated or will be released */
                cc->nr_migratepages = 0;
                if (err) {
                        putback_movable_pages(&cc->migratepages);
                        /*
                         * migrate_pages() may return -ENOMEM when scanners meet
                         * and we want compact_finished() to detect it
                         */
                        if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
                                ret = COMPACT_PARTIAL;
                                goto out;
                        }
                }

                /*
                 * Record where we could have freed pages by migration and not
                 * yet flushed them to buddy allocator. We use the pfn that
                 * isolate_migratepages() started from in this loop iteration
                 * - this is the lowest page that could have been isolated and
                 * then freed by migration.
                 */
                if (!last_migrated_pfn)
                        last_migrated_pfn = isolate_start_pfn;

check_drain:
                /*
                 * Has the migration scanner moved away from the previous
                 * cc->order aligned block where we migrated from? If yes,
                 * flush the pages that were freed, so that they can merge and
                 * compact_finished() can detect immediately if allocation
                 * would succeed.
                 */
                if (cc->order > 0 && last_migrated_pfn) {
                        int cpu;
                        unsigned long current_block_start =
                                cc->migrate_pfn & ~((1UL << cc->order) - 1);

                        if (last_migrated_pfn < current_block_start) {
                                cpu = get_cpu();
                                lru_add_drain_cpu(cpu);
                                drain_local_pages(zone);
                                put_cpu();
                                /* No more flushing until we migrate again */
                                last_migrated_pfn = 0;
                        }
                }

        }

out:
        /*
         * Release free pages and update where the free scanner should restart,
         * so we don't leave any returned pages behind in the next attempt.
         */
        if (cc->nr_freepages > 0) {
                unsigned long free_pfn = release_freepages(&cc->freepages);

                cc->nr_freepages = 0;
                VM_BUG_ON(free_pfn == 0);
                /* The cached pfn is always the first in a pageblock */
                free_pfn &= ~(pageblock_nr_pages-1);
                /*
                 * Only go back, not forward.
                 * The cached pfn might have been already reset to zone end
                 * in compact_finished()
                 */
                if (free_pfn > zone->compact_cached_free_pfn)
                        zone->compact_cached_free_pfn = free_pfn;
        }

        trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync, ret);

        return ret;
}

static unsigned long compact_zone_order(struct zone *zone, int order,
                gfp_t gfp_mask, enum migrate_mode mode, int *contended,
                int alloc_flags, int classzone_idx)
{
        unsigned long ret;
        struct compact_control cc = {
                .nr_freepages = 0,
                .nr_migratepages = 0,
                .order = order,
                .gfp_mask = gfp_mask,
                .zone = zone,
                .mode = mode,
                .alloc_flags = alloc_flags,
                .classzone_idx = classzone_idx,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);

        ret = compact_zone(zone, &cc);

        VM_BUG_ON(!list_empty(&cc.freepages));
        VM_BUG_ON(!list_empty(&cc.migratepages));

        *contended = cc.contended;
        return ret;
}

int sysctl_extfrag_threshold = 500;

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @mode: The migration mode for async, sync light, or sync migration
 * @contended: Return value that determines if compaction was aborted due to
 *             need_resched() or lock contention
 *
 * This is the main entry point for direct page compaction.
 */
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
                        int alloc_flags, const struct alloc_context *ac,
                        enum migrate_mode mode, int *contended)
{
        int may_enter_fs = gfp_mask & __GFP_FS;
        int may_perform_io = gfp_mask & __GFP_IO;
        struct zoneref *z;
        struct zone *zone;
        int rc = COMPACT_DEFERRED;
        int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */

        *contended = COMPACT_CONTENDED_NONE;

        /* Check if the GFP flags allow compaction */
        if (!order || !may_enter_fs || !may_perform_io)
                return COMPACT_SKIPPED;

        trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);

        /* Compact each zone in the list */
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                                                ac->nodemask) {
                int status;
                int zone_contended;

                if (compaction_deferred(zone, order))
                        continue;

                status = compact_zone_order(zone, order, gfp_mask, mode,
                                &zone_contended, alloc_flags,
                                ac->classzone_idx);
                rc = max(status, rc);
                /*
                 * It takes at least one zone that wasn't lock contended
                 * to clear all_zones_contended.
                 */
                all_zones_contended &= zone_contended;

                /* If a normal allocation would succeed, stop compacting */
                if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
                                        ac->classzone_idx, alloc_flags)) {
                        /*
                         * We think the allocation will succeed in this zone,
                         * but it is not certain, hence the false. The caller
                         * will repeat this with true if allocation indeed
                         * succeeds in this zone.
                         */
                        compaction_defer_reset(zone, order, false);
                        /*
                         * It is possible that async compaction aborted due to
                         * need_resched() and the watermarks were ok thanks to
                         * somebody else freeing memory. The allocation can
                         * however still fail so we better signal the
                         * need_resched() contention anyway (this will not
                         * prevent the allocation attempt).
                         */
                        if (zone_contended == COMPACT_CONTENDED_SCHED)
                                *contended = COMPACT_CONTENDED_SCHED;

                        goto break_loop;
                }

                if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
                        /*
                         * We think that allocation won't succeed in this zone
                         * so we defer compaction there. If it ends up
                         * succeeding after all, it will be reset.
                         */
                        defer_compaction(zone, order);
                }

                /*
                 * We might have stopped compacting due to need_resched() in
                 * async compaction, or due to a fatal signal detected. In that
                 * case do not try further zones and signal need_resched()
                 * contention.
                 */
                if ((zone_contended == COMPACT_CONTENDED_SCHED)
                                        || fatal_signal_pending(current)) {
                        *contended = COMPACT_CONTENDED_SCHED;
                        goto break_loop;
                }

                continue;
break_loop:
                /*
                 * We might not have tried all the zones, so be conservative
                 * and assume they are not all lock contended.
                 */
                all_zones_contended = 0;
                break;
        }

        /*
         * If at least one zone wasn't deferred or skipped, we report if all
         * zones that were tried were lock contended.
         */
        if (rc > COMPACT_SKIPPED && all_zones_contended)
                *contended = COMPACT_CONTENDED_LOCK;

        return rc;
}


/* Compact all zones within a node */
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
        int zoneid;
        struct zone *zone;

        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {

                zone = &pgdat->node_zones[zoneid];
                if (!populated_zone(zone))
                        continue;

                cc->nr_freepages = 0;
                cc->nr_migratepages = 0;
                cc->zone = zone;
                INIT_LIST_HEAD(&cc->freepages);
                INIT_LIST_HEAD(&cc->migratepages);

                if (cc->order == -1 || !compaction_deferred(zone, cc->order))
                        compact_zone(zone, cc);

                if (cc->order > 0) {
                        if (zone_watermark_ok(zone, cc->order,
                                                low_wmark_pages(zone), 0, 0))
                                compaction_defer_reset(zone, cc->order, false);
                }

                VM_BUG_ON(!list_empty(&cc->freepages));
                VM_BUG_ON(!list_empty(&cc->migratepages));
        }
}

void compact_pgdat(pg_data_t *pgdat, int order)
{
        struct compact_control cc = {
                .order = order,
                .mode = MIGRATE_ASYNC,
        };

        if (!order)
                return;

        __compact_pgdat(pgdat, &cc);
}

static void compact_node(int nid)
{
        struct compact_control cc = {
                .order = -1,
                .mode = MIGRATE_SYNC,
                .ignore_skip_hint = true,
        };

        __compact_pgdat(NODE_DATA(nid), &cc);
}

/* Compact all nodes in the system */
static void compact_nodes(void)
{
        int nid;

        /* Flush pending updates to the LRU lists */
        lru_add_drain_all();

        for_each_online_node(nid)
                compact_node(nid);
}

/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;

/* This is the entry point for compacting all nodes via /proc/sys/vm */
int sysctl_compaction_handler(struct ctl_table *table, int write,
                        void __user *buffer, size_t *length, loff_t *ppos)
{
        if (write)
                compact_nodes();

        return 0;
}

int sysctl_extfrag_handler(struct ctl_table *table, int write,
                        void __user *buffer, size_t *length, loff_t *ppos)
{
        proc_dointvec_minmax(table, write, buffer, length, ppos);

        return 0;
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t sysfs_compact_node(struct device *dev,
                        struct device_attribute *attr,
                        const char *buf, size_t count)
{
        int nid = dev->id;

        if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
                /* Flush pending updates to the LRU lists */
                lru_add_drain_all();

                compact_node(nid);
        }

        return count;
}
static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

int compaction_register_node(struct node *node)
{
        return device_create_file(&node->dev, &dev_attr_compact);
}

void compaction_unregister_node(struct node *node)
{
        return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

#endif /* CONFIG_COMPACTION */