1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * 4 * Manages the free list, the system allocates free pages here. 5 * Note that kmalloc() lives in slab.c 6 * 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 8 * Swap reorganised 29.12.95, Stephen Tweedie 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 15 */ 16 17 #include <linux/stddef.h> 18 #include <linux/mm.h> 19 #include <linux/highmem.h> 20 #include <linux/interrupt.h> 21 #include <linux/jiffies.h> 22 #include <linux/compiler.h> 23 #include <linux/kernel.h> 24 #include <linux/kasan.h> 25 #include <linux/kmsan.h> 26 #include <linux/module.h> 27 #include <linux/suspend.h> 28 #include <linux/ratelimit.h> 29 #include <linux/oom.h> 30 #include <linux/topology.h> 31 #include <linux/sysctl.h> 32 #include <linux/cpu.h> 33 #include <linux/cpuset.h> 34 #include <linux/folio_batch.h> 35 #include <linux/memory_hotplug.h> 36 #include <linux/nodemask.h> 37 #include <linux/vmstat.h> 38 #include <linux/fault-inject.h> 39 #include <linux/compaction.h> 40 #include <trace/events/kmem.h> 41 #include <trace/events/oom.h> 42 #include <linux/prefetch.h> 43 #include <linux/mm_inline.h> 44 #include <linux/mmu_notifier.h> 45 #include <linux/migrate.h> 46 #include <linux/sched/mm.h> 47 #include <linux/page_owner.h> 48 #include <linux/page_table_check.h> 49 #include <linux/memcontrol.h> 50 #include <linux/ftrace.h> 51 #include <linux/lockdep.h> 52 #include <linux/psi.h> 53 #include <linux/khugepaged.h> 54 #include <linux/delayacct.h> 55 #include <linux/cacheinfo.h> 56 #include <linux/pgalloc_tag.h> 57 #include <asm/div64.h> 58 #include "internal.h" 59 #include "shuffle.h" 60 #include 
"page_reporting.h" 61 62 /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ 63 typedef int __bitwise fpi_t; 64 65 /* No special request */ 66 #define FPI_NONE ((__force fpi_t)0) 67 68 /* 69 * Skip free page reporting notification for the (possibly merged) page. 70 * This does not hinder free page reporting from grabbing the page, 71 * reporting it and marking it "reported" - it only skips notifying 72 * the free page reporting infrastructure about a newly freed page. For 73 * example, used when temporarily pulling a page from a freelist and 74 * putting it back unmodified. 75 */ 76 #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) 77 78 /* 79 * Place the (possibly merged) page to the tail of the freelist. Will ignore 80 * page shuffling (relevant code - e.g., memory onlining - is expected to 81 * shuffle the whole zone). 82 * 83 * Note: No code should rely on this flag for correctness - it's purely 84 * to allow for optimizations when handing back either fresh pages 85 * (memory onlining) or untouched pages (page isolation, free page 86 * reporting). 87 */ 88 #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) 89 90 /* Free the page without taking locks. Rely on trylock only. */ 91 #define FPI_TRYLOCK ((__force fpi_t)BIT(2)) 92 93 /* free_pages_prepare() has already been called for page(s) being freed. */ 94 #define FPI_PREPARED ((__force fpi_t)BIT(3)) 95 96 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 97 static DEFINE_MUTEX(pcp_batch_high_lock); 98 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) 99 100 /* 101 * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid 102 * a migration causing the wrong PCP to be locked and remote memory being 103 * potentially allocated, pin the task to the CPU for the lookup+lock. 104 * preempt_disable is used on !RT because it is faster than migrate_disable. 
105 * migrate_disable is used on RT because otherwise RT spinlock usage is 106 * interfered with and a high priority task cannot preempt the allocator. 107 */ 108 #ifndef CONFIG_PREEMPT_RT 109 #define pcpu_task_pin() preempt_disable() 110 #define pcpu_task_unpin() preempt_enable() 111 #else 112 #define pcpu_task_pin() migrate_disable() 113 #define pcpu_task_unpin() migrate_enable() 114 #endif 115 116 /* 117 * A helper to lookup and trylock pcp with embedded spinlock. 118 * The return value should be used with the unlock helper. 119 * NULL return value means the trylock failed. 120 */ 121 #ifdef CONFIG_SMP 122 #define pcp_spin_trylock(ptr) \ 123 ({ \ 124 struct per_cpu_pages *_ret; \ 125 pcpu_task_pin(); \ 126 _ret = this_cpu_ptr(ptr); \ 127 if (!spin_trylock(&_ret->lock)) { \ 128 pcpu_task_unpin(); \ 129 _ret = NULL; \ 130 } \ 131 _ret; \ 132 }) 133 134 #define pcp_spin_unlock(ptr) \ 135 ({ \ 136 spin_unlock(&ptr->lock); \ 137 pcpu_task_unpin(); \ 138 }) 139 140 /* 141 * On CONFIG_SMP=n the UP implementation of spin_trylock() never fails and thus 142 * is not compatible with our locking scheme. However we do not need pcp for 143 * scalability in the first place, so just make all the trylocks fail and take 144 * the slow path unconditionally. 145 */ 146 #else 147 #define pcp_spin_trylock(ptr) \ 148 NULL 149 150 #define pcp_spin_unlock(ptr) \ 151 BUG_ON(1) 152 #endif 153 154 /* 155 * In some cases we do not need to pin the task to the CPU because we are 156 * already given a specific cpu's pcp pointer. 157 */ 158 #define pcp_spin_lock_nopin(ptr) \ 159 spin_lock(&(ptr)->lock) 160 #define pcp_spin_unlock_nopin(ptr) \ 161 spin_unlock(&(ptr)->lock) 162 163 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 164 DEFINE_PER_CPU(int, numa_node); 165 EXPORT_PER_CPU_SYMBOL(numa_node); 166 #endif 167 168 DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); 169 170 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 171 /* 172 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 
173 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 174 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 175 * defined in <linux/topology.h>. 176 */ 177 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 178 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 179 #endif 180 181 static DEFINE_MUTEX(pcpu_drain_mutex); 182 183 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 184 volatile unsigned long latent_entropy __latent_entropy; 185 EXPORT_SYMBOL(latent_entropy); 186 #endif 187 188 /* 189 * Array of node states. 190 */ 191 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 192 [N_POSSIBLE] = NODE_MASK_ALL, 193 [N_ONLINE] = { { [0] = 1UL } }, 194 #ifndef CONFIG_NUMA 195 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 196 #ifdef CONFIG_HIGHMEM 197 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 198 #endif 199 [N_MEMORY] = { { [0] = 1UL } }, 200 [N_CPU] = { { [0] = 1UL } }, 201 #endif /* NUMA */ 202 }; 203 EXPORT_SYMBOL(node_states); 204 205 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 206 207 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 208 unsigned int pageblock_order __read_mostly; 209 #endif 210 211 static void __free_pages_ok(struct page *page, unsigned int order, 212 fpi_t fpi_flags); 213 static void reserve_highatomic_pageblock(struct page *page, int order, 214 struct zone *zone); 215 216 /* 217 * results with 256, 32 in the lowmem_reserve sysctl: 218 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 219 * 1G machine -> (16M dma, 784M normal, 224M high) 220 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 221 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 222 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 223 * 224 * TBD: should special case ZONE_DMA32 machines here - in those we normally 225 * don't need any ZONE_NORMAL reservation 226 */ 227 static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { 228 #ifdef CONFIG_ZONE_DMA 229 [ZONE_DMA] = 
256, 230 #endif 231 #ifdef CONFIG_ZONE_DMA32 232 [ZONE_DMA32] = 256, 233 #endif 234 [ZONE_NORMAL] = 32, 235 #ifdef CONFIG_HIGHMEM 236 [ZONE_HIGHMEM] = 0, 237 #endif 238 [ZONE_MOVABLE] = 0, 239 }; 240 241 char * const zone_names[MAX_NR_ZONES] = { 242 #ifdef CONFIG_ZONE_DMA 243 "DMA", 244 #endif 245 #ifdef CONFIG_ZONE_DMA32 246 "DMA32", 247 #endif 248 "Normal", 249 #ifdef CONFIG_HIGHMEM 250 "HighMem", 251 #endif 252 "Movable", 253 #ifdef CONFIG_ZONE_DEVICE 254 "Device", 255 #endif 256 }; 257 258 const char * const migratetype_names[MIGRATE_TYPES] = { 259 "Unmovable", 260 "Movable", 261 "Reclaimable", 262 "HighAtomic", 263 #ifdef CONFIG_CMA 264 "CMA", 265 #endif 266 #ifdef CONFIG_MEMORY_ISOLATION 267 "Isolate", 268 #endif 269 }; 270 271 int min_free_kbytes = 1024; 272 int user_min_free_kbytes = -1; 273 static int watermark_boost_factor __read_mostly = 15000; 274 static int watermark_scale_factor = 10; 275 int defrag_mode; 276 277 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 278 int movable_zone; 279 EXPORT_SYMBOL(movable_zone); 280 281 #if MAX_NUMNODES > 1 282 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; 283 unsigned int nr_online_nodes __read_mostly = 1; 284 EXPORT_SYMBOL(nr_node_ids); 285 EXPORT_SYMBOL(nr_online_nodes); 286 #endif 287 288 /* 289 * When page allocations stall for longer than a threshold, 290 * ALLOC_STALL_WARN_MSECS, leave a warning in the kernel log. Only one warning 291 * will be printed during this duration for the entire system. 
292 */ 293 #define ALLOC_STALL_WARN_MSECS (10 * 1000UL) 294 static unsigned long alloc_stall_warn_jiffies = INITIAL_JIFFIES; 295 296 static bool page_contains_unaccepted(struct page *page, unsigned int order); 297 static bool cond_accept_memory(struct zone *zone, unsigned int order, 298 int alloc_flags); 299 static bool __free_unaccepted(struct page *page); 300 301 int page_group_by_mobility_disabled __read_mostly; 302 303 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 304 /* 305 * During boot we initialize deferred pages on-demand, as needed, but once 306 * page_alloc_init_late() has finished, the deferred pages are all initialized, 307 * and we can permanently disable that path. 308 */ 309 DEFINE_STATIC_KEY_TRUE(deferred_pages); 310 311 /* 312 * deferred_grow_zone() is __init, but it is called from 313 * get_page_from_freelist() during early boot until deferred_pages permanently 314 * disables this call. This is why we have refdata wrapper to avoid warning, 315 * and to ensure that the function body gets unloaded. 
316 */ 317 static bool __ref 318 _deferred_grow_zone(struct zone *zone, unsigned int order) 319 { 320 return deferred_grow_zone(zone, order); 321 } 322 #else 323 static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order) 324 { 325 return false; 326 } 327 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 328 329 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 330 static inline unsigned long *get_pageblock_bitmap(const struct page *page, 331 unsigned long pfn) 332 { 333 #ifdef CONFIG_SPARSEMEM 334 return section_to_usemap(__pfn_to_section(pfn)); 335 #else 336 return page_zone(page)->pageblock_flags; 337 #endif /* CONFIG_SPARSEMEM */ 338 } 339 340 static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) 341 { 342 #ifdef CONFIG_SPARSEMEM 343 pfn &= (PAGES_PER_SECTION-1); 344 #else 345 pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); 346 #endif /* CONFIG_SPARSEMEM */ 347 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 348 } 349 350 static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit) 351 { 352 return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS; 353 } 354 355 static __always_inline void 356 get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, 357 unsigned long **bitmap_word, unsigned long *bitidx) 358 { 359 unsigned long *bitmap; 360 unsigned long word_bitidx; 361 362 #ifdef CONFIG_MEMORY_ISOLATION 363 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8); 364 #else 365 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 366 #endif 367 BUILD_BUG_ON(__MIGRATE_TYPE_END > PAGEBLOCK_MIGRATETYPE_MASK); 368 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); 369 370 bitmap = get_pageblock_bitmap(page, pfn); 371 *bitidx = pfn_to_bitidx(page, pfn); 372 word_bitidx = *bitidx / BITS_PER_LONG; 373 *bitidx &= (BITS_PER_LONG - 1); 374 *bitmap_word = &bitmap[word_bitidx]; 375 } 376 377 378 /** 379 * __get_pfnblock_flags_mask - Return the requested group of flags for 
380 * a pageblock_nr_pages block of pages 381 * @page: The page within the block of interest 382 * @pfn: The target page frame number 383 * @mask: mask of bits that the caller is interested in 384 * 385 * Return: pageblock_bits flags 386 */ 387 static unsigned long __get_pfnblock_flags_mask(const struct page *page, 388 unsigned long pfn, 389 unsigned long mask) 390 { 391 unsigned long *bitmap_word; 392 unsigned long bitidx; 393 unsigned long word; 394 395 get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); 396 /* 397 * This races, without locks, with set_pfnblock_migratetype(). Ensure 398 * a consistent read of the memory array, so that results, even though 399 * racy, are not corrupted. 400 */ 401 word = READ_ONCE(*bitmap_word); 402 return (word >> bitidx) & mask; 403 } 404 405 /** 406 * get_pfnblock_bit - Check if a standalone bit of a pageblock is set 407 * @page: The page within the block of interest 408 * @pfn: The target page frame number 409 * @pb_bit: pageblock bit to check 410 * 411 * Return: true if the bit is set, otherwise false 412 */ 413 bool get_pfnblock_bit(const struct page *page, unsigned long pfn, 414 enum pageblock_bits pb_bit) 415 { 416 unsigned long *bitmap_word; 417 unsigned long bitidx; 418 419 if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) 420 return false; 421 422 get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); 423 424 return test_bit(bitidx + pb_bit, bitmap_word); 425 } 426 427 /** 428 * get_pfnblock_migratetype - Return the migratetype of a pageblock 429 * @page: The page within the block of interest 430 * @pfn: The target page frame number 431 * 432 * Return: The migratetype of the pageblock 433 * 434 * Use get_pfnblock_migratetype() if caller already has both @page and @pfn 435 * to save a call to page_to_pfn(). 
436 */ 437 enum migratetype 438 get_pfnblock_migratetype(const struct page *page, unsigned long pfn) 439 { 440 unsigned long mask = PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK; 441 unsigned long flags; 442 443 flags = __get_pfnblock_flags_mask(page, pfn, mask); 444 445 #ifdef CONFIG_MEMORY_ISOLATION 446 if (flags & BIT(PB_migrate_isolate)) 447 return MIGRATE_ISOLATE; 448 #endif 449 return flags & PAGEBLOCK_MIGRATETYPE_MASK; 450 } 451 452 /** 453 * __set_pfnblock_flags_mask - Set the requested group of flags for 454 * a pageblock_nr_pages block of pages 455 * @page: The page within the block of interest 456 * @pfn: The target page frame number 457 * @flags: The flags to set 458 * @mask: mask of bits that the caller is interested in 459 */ 460 static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn, 461 unsigned long flags, unsigned long mask) 462 { 463 unsigned long *bitmap_word; 464 unsigned long bitidx; 465 unsigned long word; 466 467 get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); 468 469 mask <<= bitidx; 470 flags <<= bitidx; 471 472 word = READ_ONCE(*bitmap_word); 473 do { 474 } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags)); 475 } 476 477 /** 478 * set_pfnblock_bit - Set a standalone bit of a pageblock 479 * @page: The page within the block of interest 480 * @pfn: The target page frame number 481 * @pb_bit: pageblock bit to set 482 */ 483 void set_pfnblock_bit(const struct page *page, unsigned long pfn, 484 enum pageblock_bits pb_bit) 485 { 486 unsigned long *bitmap_word; 487 unsigned long bitidx; 488 489 if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) 490 return; 491 492 get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); 493 494 set_bit(bitidx + pb_bit, bitmap_word); 495 } 496 497 /** 498 * clear_pfnblock_bit - Clear a standalone bit of a pageblock 499 * @page: The page within the block of interest 500 * @pfn: The target page frame number 501 * @pb_bit: pageblock bit to clear 502 */ 503 void 
clear_pfnblock_bit(const struct page *page, unsigned long pfn, 504 enum pageblock_bits pb_bit) 505 { 506 unsigned long *bitmap_word; 507 unsigned long bitidx; 508 509 if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) 510 return; 511 512 get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx); 513 514 clear_bit(bitidx + pb_bit, bitmap_word); 515 } 516 517 /** 518 * set_pageblock_migratetype - Set the migratetype of a pageblock 519 * @page: The page within the block of interest 520 * @migratetype: migratetype to set 521 */ 522 static void set_pageblock_migratetype(struct page *page, 523 enum migratetype migratetype) 524 { 525 if (unlikely(page_group_by_mobility_disabled && 526 migratetype < MIGRATE_PCPTYPES)) 527 migratetype = MIGRATE_UNMOVABLE; 528 529 #ifdef CONFIG_MEMORY_ISOLATION 530 if (migratetype == MIGRATE_ISOLATE) { 531 VM_WARN_ONCE(1, 532 "Use set_pageblock_isolate() for pageblock isolation"); 533 return; 534 } 535 VM_WARN_ONCE(get_pageblock_isolate(page), 536 "Use clear_pageblock_isolate() to unisolate pageblock"); 537 /* PAGEBLOCK_ISO_MASK clears PB_migrate_isolate if it is set */ 538 #endif 539 __set_pfnblock_flags_mask(page, page_to_pfn(page), 540 (unsigned long)migratetype, 541 PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK); 542 } 543 544 void __meminit init_pageblock_migratetype(struct page *page, 545 enum migratetype migratetype, 546 bool isolate) 547 { 548 unsigned long flags; 549 550 if (unlikely(page_group_by_mobility_disabled && 551 migratetype < MIGRATE_PCPTYPES)) 552 migratetype = MIGRATE_UNMOVABLE; 553 554 flags = migratetype; 555 556 #ifdef CONFIG_MEMORY_ISOLATION 557 if (migratetype == MIGRATE_ISOLATE) { 558 VM_WARN_ONCE( 559 1, 560 "Set isolate=true to isolate pageblock with a migratetype"); 561 return; 562 } 563 if (isolate) 564 flags |= BIT(PB_migrate_isolate); 565 #endif 566 __set_pfnblock_flags_mask(page, page_to_pfn(page), flags, 567 PAGEBLOCK_MIGRATETYPE_MASK | PAGEBLOCK_ISO_MASK); 568 } 569 570 #ifdef CONFIG_DEBUG_VM 571 static 
int page_outside_zone_boundaries(struct zone *zone, struct page *page) 572 { 573 int ret; 574 unsigned seq; 575 unsigned long pfn = page_to_pfn(page); 576 unsigned long sp, start_pfn; 577 578 do { 579 seq = zone_span_seqbegin(zone); 580 start_pfn = zone->zone_start_pfn; 581 sp = zone->spanned_pages; 582 ret = !zone_spans_pfn(zone, pfn); 583 } while (zone_span_seqretry(zone, seq)); 584 585 if (ret) 586 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", 587 pfn, zone_to_nid(zone), zone->name, 588 start_pfn, start_pfn + sp); 589 590 return ret; 591 } 592 593 /* 594 * Temporary debugging check for pages not lying within a given zone. 595 */ 596 static bool __maybe_unused bad_range(struct zone *zone, struct page *page) 597 { 598 if (page_outside_zone_boundaries(zone, page)) 599 return true; 600 if (zone != page_zone(page)) 601 return true; 602 603 return false; 604 } 605 #else 606 static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page) 607 { 608 return false; 609 } 610 #endif 611 612 static void bad_page(struct page *page, const char *reason) 613 { 614 static unsigned long resume; 615 static unsigned long nr_shown; 616 static unsigned long nr_unshown; 617 618 /* 619 * Allow a burst of 60 reports, then keep quiet for that minute; 620 * or allow a steady drip of one report per second. 
621 */ 622 if (nr_shown == 60) { 623 if (time_before(jiffies, resume)) { 624 nr_unshown++; 625 goto out; 626 } 627 if (nr_unshown) { 628 pr_alert( 629 "BUG: Bad page state: %lu messages suppressed\n", 630 nr_unshown); 631 nr_unshown = 0; 632 } 633 nr_shown = 0; 634 } 635 if (nr_shown++ == 0) 636 resume = jiffies + 60 * HZ; 637 638 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", 639 current->comm, page_to_pfn(page)); 640 dump_page(page, reason); 641 642 print_modules(); 643 dump_stack(); 644 out: 645 /* Leave bad fields for debug, except PageBuddy could make trouble */ 646 if (PageBuddy(page)) 647 __ClearPageBuddy(page); 648 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 649 } 650 651 static inline unsigned int order_to_pindex(int migratetype, int order) 652 { 653 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { 654 bool movable = migratetype == MIGRATE_MOVABLE; 655 656 if (order > PAGE_ALLOC_COSTLY_ORDER) 657 return NR_LOWORDER_PCP_LISTS + movable; 658 } 659 660 return (MIGRATE_PCPTYPES * order) + migratetype; 661 } 662 663 static inline int pindex_to_order(unsigned int pindex) 664 { 665 int order = pindex / MIGRATE_PCPTYPES; 666 667 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { 668 if (pindex >= NR_LOWORDER_PCP_LISTS) 669 order = HPAGE_PMD_ORDER; 670 } 671 672 return order; 673 } 674 675 static inline bool pcp_allowed_order(unsigned int order) 676 { 677 if (order <= PAGE_ALLOC_COSTLY_ORDER) 678 return true; 679 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 680 if (is_pmd_order(order)) 681 return true; 682 #endif 683 return false; 684 } 685 686 /* 687 * Higher-order pages are called "compound pages". They are structured thusly: 688 * 689 * The first PAGE_SIZE page is called the "head page" and have PG_head set. 690 * 691 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded 692 * in bit 0 of page->compound_info. The rest of bits is pointer to head page. 693 * 694 * The first tail page's ->compound_order holds the order of allocation. 
695 * This usage means that zero-order pages may not be compound. 696 */ 697 698 void prep_compound_page(struct page *page, unsigned int order) 699 { 700 int i; 701 int nr_pages = 1 << order; 702 703 __SetPageHead(page); 704 for (i = 1; i < nr_pages; i++) 705 prep_compound_tail(page + i, page, order); 706 707 prep_compound_head(page, order); 708 } 709 710 static inline void set_buddy_order(struct page *page, unsigned int order) 711 { 712 set_page_private(page, order); 713 __SetPageBuddy(page); 714 } 715 716 #ifdef CONFIG_COMPACTION 717 static inline struct capture_control *task_capc(struct zone *zone) 718 { 719 struct capture_control *capc = current->capture_control; 720 721 return unlikely(capc) && 722 !(current->flags & PF_KTHREAD) && 723 !capc->page && 724 capc->cc->zone == zone ? capc : NULL; 725 } 726 727 static inline bool 728 compaction_capture(struct capture_control *capc, struct page *page, 729 int order, int migratetype) 730 { 731 if (!capc || order != capc->cc->order) 732 return false; 733 734 /* Do not accidentally pollute CMA or isolated regions*/ 735 if (is_migrate_cma(migratetype) || 736 is_migrate_isolate(migratetype)) 737 return false; 738 739 /* 740 * Do not let lower order allocations pollute a movable pageblock 741 * unless compaction is also requesting movable pages. 742 * This might let an unmovable request use a reclaimable pageblock 743 * and vice-versa but no more than normal fallback logic which can 744 * have trouble finding a high-order free page. 
745 */ 746 if (order < pageblock_order && migratetype == MIGRATE_MOVABLE && 747 capc->cc->migratetype != MIGRATE_MOVABLE) 748 return false; 749 750 if (migratetype != capc->cc->migratetype) 751 trace_mm_page_alloc_extfrag(page, capc->cc->order, order, 752 capc->cc->migratetype, migratetype); 753 754 capc->page = page; 755 return true; 756 } 757 758 #else 759 static inline struct capture_control *task_capc(struct zone *zone) 760 { 761 return NULL; 762 } 763 764 static inline bool 765 compaction_capture(struct capture_control *capc, struct page *page, 766 int order, int migratetype) 767 { 768 return false; 769 } 770 #endif /* CONFIG_COMPACTION */ 771 772 static inline void account_freepages(struct zone *zone, int nr_pages, 773 int migratetype) 774 { 775 lockdep_assert_held(&zone->lock); 776 777 if (is_migrate_isolate(migratetype)) 778 return; 779 780 __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); 781 782 if (is_migrate_cma(migratetype)) 783 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); 784 else if (migratetype == MIGRATE_HIGHATOMIC) 785 WRITE_ONCE(zone->nr_free_highatomic, 786 zone->nr_free_highatomic + nr_pages); 787 } 788 789 /* Used for pages not on another list */ 790 static inline void __add_to_free_list(struct page *page, struct zone *zone, 791 unsigned int order, int migratetype, 792 bool tail) 793 { 794 struct free_area *area = &zone->free_area[order]; 795 int nr_pages = 1 << order; 796 797 VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, 798 "page type is %d, passed migratetype is %d (nr=%d)\n", 799 get_pageblock_migratetype(page), migratetype, nr_pages); 800 801 if (tail) 802 list_add_tail(&page->buddy_list, &area->free_list[migratetype]); 803 else 804 list_add(&page->buddy_list, &area->free_list[migratetype]); 805 area->nr_free++; 806 807 if (order >= pageblock_order && !is_migrate_isolate(migratetype)) 808 __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages); 809 } 810 811 /* 812 * Used for pages which are on another 
list. Move the pages to the tail 813 * of the list - so the moved pages won't immediately be considered for 814 * allocation again (e.g., optimization for memory onlining). 815 */ 816 static inline void move_to_free_list(struct page *page, struct zone *zone, 817 unsigned int order, int old_mt, int new_mt) 818 { 819 struct free_area *area = &zone->free_area[order]; 820 int nr_pages = 1 << order; 821 822 /* Free page moving can fail, so it happens before the type update */ 823 VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, 824 "page type is %d, passed migratetype is %d (nr=%d)\n", 825 get_pageblock_migratetype(page), old_mt, nr_pages); 826 827 list_move_tail(&page->buddy_list, &area->free_list[new_mt]); 828 829 account_freepages(zone, -nr_pages, old_mt); 830 account_freepages(zone, nr_pages, new_mt); 831 832 if (order >= pageblock_order && 833 is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) { 834 if (!is_migrate_isolate(old_mt)) 835 nr_pages = -nr_pages; 836 __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages); 837 } 838 } 839 840 static inline void __del_page_from_free_list(struct page *page, struct zone *zone, 841 unsigned int order, int migratetype) 842 { 843 int nr_pages = 1 << order; 844 845 VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, 846 "page type is %d, passed migratetype is %d (nr=%d)\n", 847 get_pageblock_migratetype(page), migratetype, nr_pages); 848 849 /* clear reported state and update reported page count */ 850 if (page_reported(page)) 851 __ClearPageReported(page); 852 853 list_del(&page->buddy_list); 854 __ClearPageBuddy(page); 855 set_page_private(page, 0); 856 zone->free_area[order].nr_free--; 857 858 if (order >= pageblock_order && !is_migrate_isolate(migratetype)) 859 __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages); 860 } 861 862 static inline void del_page_from_free_list(struct page *page, struct zone *zone, 863 unsigned int order, int migratetype) 864 { 865 __del_page_from_free_list(page, 
zone, order, migratetype); 866 account_freepages(zone, -(1 << order), migratetype); 867 } 868 869 static inline struct page *get_page_from_free_area(struct free_area *area, 870 int migratetype) 871 { 872 return list_first_entry_or_null(&area->free_list[migratetype], 873 struct page, buddy_list); 874 } 875 876 /* 877 * If this is less than the 2nd largest possible page, check if the buddy 878 * of the next-higher order is free. If it is, it's possible 879 * that pages are being freed that will coalesce soon. In case, 880 * that is happening, add the free page to the tail of the list 881 * so it's less likely to be used soon and more likely to be merged 882 * as a 2-level higher order page 883 */ 884 static inline bool 885 buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, 886 struct page *page, unsigned int order) 887 { 888 unsigned long higher_page_pfn; 889 struct page *higher_page; 890 891 if (order >= MAX_PAGE_ORDER - 1) 892 return false; 893 894 higher_page_pfn = buddy_pfn & pfn; 895 higher_page = page + (higher_page_pfn - pfn); 896 897 return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1, 898 NULL) != NULL; 899 } 900 901 static void change_pageblock_range(struct page *pageblock_page, 902 int start_order, int migratetype) 903 { 904 int nr_pageblocks = 1 << (start_order - pageblock_order); 905 906 while (nr_pageblocks--) { 907 set_pageblock_migratetype(pageblock_page, migratetype); 908 pageblock_page += pageblock_nr_pages; 909 } 910 } 911 912 /* 913 * Freeing function for a buddy system allocator. 914 * 915 * The concept of a buddy system is to maintain direct-mapped table 916 * (containing bit values) for memory blocks of various "orders". 917 * The bottom level table contains the map for the smallest allocatable 918 * units of memory (here, pages), and each level above it describes 919 * pairs of units from the levels below, hence, "buddies". 
920 * At a high level, all that happens here is marking the table entry 921 * at the bottom level available, and propagating the changes upward 922 * as necessary, plus some accounting needed to play nicely with other 923 * parts of the VM system. 924 * At each level, we keep a list of pages, which are heads of continuous 925 * free pages of length of (1 << order) and marked with PageBuddy. 926 * Page's order is recorded in page_private(page) field. 927 * So when we are allocating or freeing one, we can derive the state of the 928 * other. That is, if we allocate a small block, and both were 929 * free, the remainder of the region must be split into blocks. 930 * If a block is freed, and its buddy is also free, then this 931 * triggers coalescing into a block of larger size. 932 * 933 * -- nyc 934 */ 935 936 static inline void __free_one_page(struct page *page, 937 unsigned long pfn, 938 struct zone *zone, unsigned int order, 939 int migratetype, fpi_t fpi_flags) 940 { 941 struct capture_control *capc = task_capc(zone); 942 unsigned long buddy_pfn = 0; 943 unsigned long combined_pfn; 944 struct page *buddy; 945 bool to_tail; 946 947 VM_BUG_ON(!zone_is_initialized(zone)); 948 VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page); 949 950 VM_BUG_ON(migratetype == -1); 951 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); 952 VM_BUG_ON_PAGE(bad_range(zone, page), page); 953 954 account_freepages(zone, 1 << order, migratetype); 955 956 while (order < MAX_PAGE_ORDER) { 957 int buddy_mt = migratetype; 958 959 if (compaction_capture(capc, page, order, migratetype)) { 960 account_freepages(zone, -(1 << order), migratetype); 961 return; 962 } 963 964 buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); 965 if (!buddy) 966 goto done_merging; 967 968 if (unlikely(order >= pageblock_order)) { 969 /* 970 * We want to prevent merge between freepages on pageblock 971 * without fallbacks and normal pageblock. 
Without this, 972 * pageblock isolation could cause incorrect freepage or CMA 973 * accounting or HIGHATOMIC accounting. 974 */ 975 buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); 976 977 if (migratetype != buddy_mt && 978 (!migratetype_is_mergeable(migratetype) || 979 !migratetype_is_mergeable(buddy_mt))) 980 goto done_merging; 981 } 982 983 /* 984 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 985 * merge with it and move up one order. 986 */ 987 if (page_is_guard(buddy)) 988 clear_page_guard(zone, buddy, order); 989 else 990 __del_page_from_free_list(buddy, zone, order, buddy_mt); 991 992 if (unlikely(buddy_mt != migratetype)) { 993 /* 994 * Match buddy type. This ensures that an 995 * expand() down the line puts the sub-blocks 996 * on the right freelists. 997 */ 998 change_pageblock_range(buddy, order, migratetype); 999 } 1000 1001 combined_pfn = buddy_pfn & pfn; 1002 page = page + (combined_pfn - pfn); 1003 pfn = combined_pfn; 1004 order++; 1005 } 1006 1007 done_merging: 1008 set_buddy_order(page, order); 1009 1010 if (fpi_flags & FPI_TO_TAIL) 1011 to_tail = true; 1012 else if (is_shuffle_order(order)) 1013 to_tail = shuffle_pick_tail(); 1014 else 1015 to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); 1016 1017 __add_to_free_list(page, zone, order, migratetype, to_tail); 1018 1019 /* Notify page reporting subsystem of freed page */ 1020 if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) 1021 page_reporting_notify_free(order); 1022 } 1023 1024 /* 1025 * A bad page could be due to a number of fields. Instead of multiple branches, 1026 * try and check multiple fields with one check. The caller must do a detailed 1027 * check if necessary. 
1028 */ 1029 static inline bool page_expected_state(struct page *page, 1030 unsigned long check_flags) 1031 { 1032 if (unlikely(atomic_read(&page->_mapcount) != -1)) 1033 return false; 1034 1035 if (unlikely((unsigned long)page->mapping | 1036 page_ref_count(page) | 1037 #ifdef CONFIG_MEMCG 1038 page->memcg_data | 1039 #endif 1040 page_pool_page_is_pp(page) | 1041 (page->flags.f & check_flags))) 1042 return false; 1043 1044 return true; 1045 } 1046 1047 static const char *page_bad_reason(struct page *page, unsigned long flags) 1048 { 1049 const char *bad_reason = NULL; 1050 1051 if (unlikely(atomic_read(&page->_mapcount) != -1)) 1052 bad_reason = "nonzero mapcount"; 1053 if (unlikely(page->mapping != NULL)) 1054 bad_reason = "non-NULL mapping"; 1055 if (unlikely(page_ref_count(page) != 0)) 1056 bad_reason = "nonzero _refcount"; 1057 if (unlikely(page->flags.f & flags)) { 1058 if (flags == PAGE_FLAGS_CHECK_AT_PREP) 1059 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; 1060 else 1061 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 1062 } 1063 #ifdef CONFIG_MEMCG 1064 if (unlikely(page->memcg_data)) 1065 bad_reason = "page still charged to cgroup"; 1066 #endif 1067 if (unlikely(page_pool_page_is_pp(page))) 1068 bad_reason = "page_pool leak"; 1069 return bad_reason; 1070 } 1071 1072 static inline bool free_page_is_bad(struct page *page) 1073 { 1074 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) 1075 return false; 1076 1077 /* Something has gone sideways, find it */ 1078 bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); 1079 return true; 1080 } 1081 1082 static inline bool is_check_pages_enabled(void) 1083 { 1084 return static_branch_unlikely(&check_pages_enabled); 1085 } 1086 1087 static int free_tail_page_prepare(struct page *head_page, struct page *page) 1088 { 1089 struct folio *folio = (struct folio *)head_page; 1090 int ret = 1; 1091 1092 /* 1093 * We rely page->lru.next never has bit 0 set, unless the page 1094 * is 
PageTail(). Let's make sure that's true even for poisoned ->lru. 1095 */ 1096 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); 1097 1098 if (!is_check_pages_enabled()) { 1099 ret = 0; 1100 goto out; 1101 } 1102 switch (page - head_page) { 1103 case 1: 1104 /* the first tail page: these may be in place of ->mapping */ 1105 if (unlikely(folio_large_mapcount(folio))) { 1106 bad_page(page, "nonzero large_mapcount"); 1107 goto out; 1108 } 1109 if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) && 1110 unlikely(atomic_read(&folio->_nr_pages_mapped))) { 1111 bad_page(page, "nonzero nr_pages_mapped"); 1112 goto out; 1113 } 1114 if (IS_ENABLED(CONFIG_MM_ID)) { 1115 if (unlikely(folio->_mm_id_mapcount[0] != -1)) { 1116 bad_page(page, "nonzero mm mapcount 0"); 1117 goto out; 1118 } 1119 if (unlikely(folio->_mm_id_mapcount[1] != -1)) { 1120 bad_page(page, "nonzero mm mapcount 1"); 1121 goto out; 1122 } 1123 } 1124 if (IS_ENABLED(CONFIG_64BIT)) { 1125 if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) { 1126 bad_page(page, "nonzero entire_mapcount"); 1127 goto out; 1128 } 1129 if (unlikely(atomic_read(&folio->_pincount))) { 1130 bad_page(page, "nonzero pincount"); 1131 goto out; 1132 } 1133 } 1134 break; 1135 case 2: 1136 /* the second tail page: deferred_list overlaps ->mapping */ 1137 if (unlikely(!list_empty(&folio->_deferred_list))) { 1138 bad_page(page, "on deferred list"); 1139 goto out; 1140 } 1141 if (!IS_ENABLED(CONFIG_64BIT)) { 1142 if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) { 1143 bad_page(page, "nonzero entire_mapcount"); 1144 goto out; 1145 } 1146 if (unlikely(atomic_read(&folio->_pincount))) { 1147 bad_page(page, "nonzero pincount"); 1148 goto out; 1149 } 1150 } 1151 break; 1152 case 3: 1153 /* the third tail page: hugetlb specifics overlap ->mappings */ 1154 if (IS_ENABLED(CONFIG_HUGETLB_PAGE)) 1155 break; 1156 fallthrough; 1157 default: 1158 if (page->mapping != TAIL_MAPPING) { 1159 bad_page(page, "corrupted mapping in tail page"); 1160 goto out; 1161 } 1162 
break; 1163 } 1164 if (unlikely(!PageTail(page))) { 1165 bad_page(page, "PageTail not set"); 1166 goto out; 1167 } 1168 if (unlikely(compound_head(page) != head_page)) { 1169 bad_page(page, "compound_head not consistent"); 1170 goto out; 1171 } 1172 ret = 0; 1173 out: 1174 page->mapping = NULL; 1175 clear_compound_head(page); 1176 return ret; 1177 } 1178 1179 /* 1180 * Skip KASAN memory poisoning when either: 1181 * 1182 * 1. For generic KASAN: deferred memory initialization has not yet completed. 1183 * Tag-based KASAN modes skip pages freed via deferred memory initialization 1184 * using page tags instead (see below). 1185 * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating 1186 * that error detection is disabled for accesses via the page address. 1187 * 1188 * Pages will have match-all tags in the following circumstances: 1189 * 1190 * 1. Pages are being initialized for the first time, including during deferred 1191 * memory init; see the call to page_kasan_tag_reset in __init_single_page. 1192 * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the 1193 * exception of pages unpoisoned by kasan_unpoison_vmalloc. 1194 * 3. The allocation was excluded from being checked due to sampling, 1195 * see the call to kasan_unpoison_pages. 1196 * 1197 * Poisoning pages during deferred memory init will greatly lengthen the 1198 * process and cause problem in large memory systems as the deferred pages 1199 * initialization is done with interrupt disabled. 1200 * 1201 * Assuming that there will be no reference to those newly initialized 1202 * pages before they are ever allocated, this should have no effect on 1203 * KASAN memory tracking as the poison will be properly inserted at page 1204 * allocation time. The only corner case is when pages are allocated by 1205 * on-demand allocation and then freed again before the deferred pages 1206 * initialization is done, but this is not likely to happen. 
1207 */ 1208 static inline bool should_skip_kasan_poison(struct page *page) 1209 { 1210 if (IS_ENABLED(CONFIG_KASAN_GENERIC)) 1211 return deferred_pages_enabled(); 1212 1213 return page_kasan_tag(page) == KASAN_TAG_KERNEL; 1214 } 1215 1216 static void clear_highpages_kasan_tagged(struct page *page, int numpages) 1217 { 1218 /* s390's use of memset() could override KASAN redzones. */ 1219 kasan_disable_current(); 1220 if (!IS_ENABLED(CONFIG_HIGHMEM)) { 1221 clear_pages(kasan_reset_tag(page_address(page)), numpages); 1222 } else { 1223 int i; 1224 1225 for (i = 0; i < numpages; i++) 1226 clear_highpage_kasan_tagged(page + i); 1227 } 1228 kasan_enable_current(); 1229 } 1230 1231 #ifdef CONFIG_MEM_ALLOC_PROFILING 1232 1233 /* Should be called only if mem_alloc_profiling_enabled() */ 1234 void __clear_page_tag_ref(struct page *page) 1235 { 1236 union pgtag_ref_handle handle; 1237 union codetag_ref ref; 1238 1239 if (get_page_tag_ref(page, &ref, &handle)) { 1240 set_codetag_empty(&ref); 1241 update_page_tag_ref(handle, &ref); 1242 put_page_tag_ref(handle); 1243 } 1244 } 1245 1246 /* Should be called only if mem_alloc_profiling_enabled() */ 1247 static noinline 1248 void __pgalloc_tag_add(struct page *page, struct task_struct *task, 1249 unsigned int nr) 1250 { 1251 union pgtag_ref_handle handle; 1252 union codetag_ref ref; 1253 1254 if (likely(get_page_tag_ref(page, &ref, &handle))) { 1255 alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); 1256 update_page_tag_ref(handle, &ref); 1257 put_page_tag_ref(handle); 1258 } else { 1259 /* 1260 * page_ext is not available yet, record the pfn so we can 1261 * clear the tag ref later when page_ext is initialized. 
/*
 * Tear down the state of a block of (1 << @order) pages before it is handed
 * back to the allocator.
 *
 * @page:      head (or only) page of the block; must not be a tail page
 * @order:     order of the block
 * @fpi_flags: FPI_PREPARED makes this a no-op that returns true; FPI_TRYLOCK
 *             skips the debug lock/object checks
 *
 * Returns true when the block may be freed. Returns false when it must not be
 * put on the free lists: an order-0 hwpoisoned page, or, with page checking
 * enabled, a block in which a bad page was found.
 */
static __always_inline bool __free_pages_prepare(struct page *page,
			unsigned int order, fpi_t fpi_flags)
{
	int bad = 0;
	bool skip_kasan_poison = should_skip_kasan_poison(page);
	bool init = want_init_on_free();
	bool compound = PageCompound(page);
	struct folio *folio = page_folio(page);

	/* The caller states that preparation has already been done. */
	if (fpi_flags & FPI_PREPARED)
		return true;

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);
	kmsan_free_page(page, order);

	if (memcg_kmem_online() && PageMemcgKmem(page))
		__memcg_kmem_uncharge_page(page, order);

	/*
	 * In rare cases, when truncation or holepunching raced with
	 * munlock after VM_LOCKED was cleared, Mlocked may still be
	 * found set here.  This does not indicate a problem, unless
	 * "unevictable_pgs_cleared" appears worryingly large.
	 */
	if (unlikely(folio_test_mlocked(folio))) {
		long nr_pages = folio_nr_pages(folio);

		__folio_clear_mlocked(folio);
		zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
		count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	}

	if (unlikely(PageHWPoison(page)) && !order) {
		/* Do not let hwpoison pages hit pcplists/buddy */
		reset_page_owner(page, order);
		page_table_check_free(page, order);
		pgalloc_tag_sub(page, 1 << order);

		/*
		 * The page is isolated and accounted for.
		 * Mark the codetag as empty to avoid accounting error
		 * when the page is freed by unpoison_memory().
		 */
		clear_page_tag_ref(page);
		return false;
	}

	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		int i;

		if (compound) {
			page[1].flags.f &= ~PAGE_FLAGS_SECOND;
#ifdef NR_PAGES_IN_LARGE_FOLIO
			folio->_nr_pages = 0;
#endif
		}
		/* Index 0 is the head page, which is handled further below. */
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_page_prepare(page, page + i);
			if (is_check_pages_enabled()) {
				/* A bad page keeps its flags untouched. */
				if (free_page_is_bad(page + i)) {
					bad++;
					continue;
				}
			}
			(page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (folio_test_anon(folio)) {
		mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
		folio->mapping = NULL;
	}
	if (unlikely(page_has_type(page)))
		/* Reset the page_type (which overlays _mapcount) */
		page->page_type = UINT_MAX;

	if (is_check_pages_enabled()) {
		if (free_page_is_bad(page))
			bad++;
		/* Any bad page, head or tail, vetoes freeing the block. */
		if (bad)
			return false;
	}

	page_cpupid_reset_last(page);
	page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
	page->private = 0;
	reset_page_owner(page, order);
	page_table_check_free(page, order);
	pgalloc_tag_sub(page, 1 << order);

	if (!PageHighMem(page) && !(fpi_flags & FPI_TRYLOCK)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}

	kernel_poison_pages(page, 1 << order);

	/*
	 * As memory initialization might be integrated into KASAN,
	 * KASAN poisoning and memory initialization code must be
	 * kept together to avoid discrepancies in behavior.
	 *
	 * With hardware tag-based KASAN, memory tags must be set before the
	 * page becomes unavailable via debug_pagealloc or arch_free_page.
	 */
	if (!skip_kasan_poison) {
		kasan_poison_pages(page, order, init);

		/* Memory is already initialized if KASAN did it internally. */
		if (kasan_has_integrated_init())
			init = false;
	}
	if (init)
		clear_highpages_kasan_tagged(page, 1 << order);

	/*
	 * arch_free_page() can make the page's contents inaccessible.  s390
	 * does this.  So nothing which can access the page's contents should
	 * happen after this.
	 */
	arch_free_page(page, order);

	debug_pagealloc_unmap_pages(page, 1 << order);

	return true;
}

/*
 * Prepare a block of (1 << @order) pages for freeing with no special FPI
 * flags. Returns what __free_pages_prepare() returns.
 */
bool free_pages_prepare(struct page *page, unsigned int order)
{
	return __free_pages_prepare(page, order, FPI_NONE);
}
*/ 1482 do { 1483 if (++pindex > NR_PCP_LISTS - 1) 1484 pindex = 0; 1485 list = &pcp->lists[pindex]; 1486 } while (list_empty(list)); 1487 1488 order = pindex_to_order(pindex); 1489 nr_pages = 1 << order; 1490 do { 1491 unsigned long pfn; 1492 int mt; 1493 1494 page = list_last_entry(list, struct page, pcp_list); 1495 pfn = page_to_pfn(page); 1496 mt = get_pfnblock_migratetype(page, pfn); 1497 1498 /* must delete to avoid corrupting pcp list */ 1499 list_del(&page->pcp_list); 1500 count -= nr_pages; 1501 pcp->count -= nr_pages; 1502 1503 __free_one_page(page, pfn, zone, order, mt, FPI_NONE); 1504 trace_mm_page_pcpu_drain(page, order, mt); 1505 } while (count > 0 && !list_empty(list)); 1506 } 1507 } 1508 1509 /* Split a multi-block free page into its individual pageblocks. */ 1510 static void split_large_buddy(struct zone *zone, struct page *page, 1511 unsigned long pfn, int order, fpi_t fpi) 1512 { 1513 unsigned long end = pfn + (1 << order); 1514 1515 VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order)); 1516 /* Caller removed page from freelist, buddy info cleared! 
*/ 1517 VM_WARN_ON_ONCE(PageBuddy(page)); 1518 1519 if (order > pageblock_order) 1520 order = pageblock_order; 1521 1522 do { 1523 int mt = get_pfnblock_migratetype(page, pfn); 1524 1525 __free_one_page(page, pfn, zone, order, mt, fpi); 1526 pfn += 1 << order; 1527 if (pfn == end) 1528 break; 1529 page = pfn_to_page(pfn); 1530 } while (1); 1531 } 1532 1533 static void add_page_to_zone_llist(struct zone *zone, struct page *page, 1534 unsigned int order) 1535 { 1536 /* Remember the order */ 1537 page->private = order; 1538 /* Add the page to the free list */ 1539 llist_add(&page->pcp_llist, &zone->trylock_free_pages); 1540 } 1541 1542 static void free_one_page(struct zone *zone, struct page *page, 1543 unsigned long pfn, unsigned int order, 1544 fpi_t fpi_flags) 1545 { 1546 struct llist_head *llhead; 1547 unsigned long flags; 1548 1549 if (unlikely(fpi_flags & FPI_TRYLOCK)) { 1550 if (!spin_trylock_irqsave(&zone->lock, flags)) { 1551 add_page_to_zone_llist(zone, page, order); 1552 return; 1553 } 1554 } else { 1555 spin_lock_irqsave(&zone->lock, flags); 1556 } 1557 1558 /* The lock succeeded. Process deferred pages. 
*/ 1559 llhead = &zone->trylock_free_pages; 1560 if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) { 1561 struct llist_node *llnode; 1562 struct page *p, *tmp; 1563 1564 llnode = llist_del_all(llhead); 1565 llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { 1566 unsigned int p_order = p->private; 1567 1568 split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); 1569 __count_vm_events(PGFREE, 1 << p_order); 1570 } 1571 } 1572 split_large_buddy(zone, page, pfn, order, fpi_flags); 1573 spin_unlock_irqrestore(&zone->lock, flags); 1574 1575 __count_vm_events(PGFREE, 1 << order); 1576 } 1577 1578 static void __free_pages_ok(struct page *page, unsigned int order, 1579 fpi_t fpi_flags) 1580 { 1581 unsigned long pfn = page_to_pfn(page); 1582 struct zone *zone = page_zone(page); 1583 1584 if (__free_pages_prepare(page, order, fpi_flags)) 1585 free_one_page(zone, page, pfn, order, fpi_flags); 1586 } 1587 1588 void __meminit __free_pages_core(struct page *page, unsigned int order, 1589 enum meminit_context context) 1590 { 1591 unsigned int nr_pages = 1 << order; 1592 struct page *p = page; 1593 unsigned int loop; 1594 1595 /* 1596 * When initializing the memmap, __init_single_page() sets the refcount 1597 * of all pages to 1 ("allocated"/"not free"). We have to set the 1598 * refcount of all involved pages to 0. 1599 * 1600 * Note that hotplugged memory pages are initialized to PageOffline(). 1601 * Pages freed from memblock might be marked as reserved. 1602 */ 1603 if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && 1604 unlikely(context == MEMINIT_HOTPLUG)) { 1605 for (loop = 0; loop < nr_pages; loop++, p++) { 1606 VM_WARN_ON_ONCE(PageReserved(p)); 1607 __ClearPageOffline(p); 1608 set_page_count(p, 0); 1609 } 1610 1611 adjust_managed_page_count(page, nr_pages); 1612 } else { 1613 for (loop = 0; loop < nr_pages; loop++, p++) { 1614 __ClearPageReserved(p); 1615 set_page_count(p, 0); 1616 } 1617 1618 /* memblock adjusts totalram_pages() manually. 
*/ 1619 atomic_long_add(nr_pages, &page_zone(page)->managed_pages); 1620 } 1621 1622 if (page_contains_unaccepted(page, order)) { 1623 if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) 1624 return; 1625 1626 accept_memory(page_to_phys(page), PAGE_SIZE << order); 1627 } 1628 1629 /* 1630 * Bypass PCP and place fresh pages right to the tail, primarily 1631 * relevant for memory onlining. 1632 */ 1633 __free_pages_ok(page, order, FPI_TO_TAIL); 1634 } 1635 1636 /* 1637 * Check that the whole (or subset of) a pageblock given by the interval of 1638 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it 1639 * with the migration of free compaction scanner. 1640 * 1641 * Return struct page pointer of start_pfn, or NULL if checks were not passed. 1642 * 1643 * It's possible on some configurations to have a setup like node0 node1 node0 1644 * i.e. it's possible that all pages within a zones range of pages do not 1645 * belong to a single zone. We assume that a border between node0 and node1 1646 * can occur within a single pageblock, but not a node0 node1 node0 1647 * interleaving within a single pageblock. It is therefore sufficient to check 1648 * the first and last page of a pageblock and avoid checking each individual 1649 * page in a pageblock. 1650 * 1651 * Note: the function may return non-NULL struct page even for a page block 1652 * which contains a memory hole (i.e. there is no physical memory for a subset 1653 * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which 1654 * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole 1655 * even though the start pfn is online and valid. This should be safe most of 1656 * the time because struct pages are still initialized via init_unavailable_range() 1657 * and pfn walkers shouldn't touch any physical memory range for which they do 1658 * not recognize any specific metadata in struct pages. 
1659 */ 1660 struct page *__pageblock_pfn_to_page(unsigned long start_pfn, 1661 unsigned long end_pfn, struct zone *zone) 1662 { 1663 struct page *start_page; 1664 struct page *end_page; 1665 1666 /* end_pfn is one past the range we are checking */ 1667 end_pfn--; 1668 1669 if (!pfn_valid(end_pfn)) 1670 return NULL; 1671 1672 start_page = pfn_to_online_page(start_pfn); 1673 if (!start_page) 1674 return NULL; 1675 1676 if (page_zone(start_page) != zone) 1677 return NULL; 1678 1679 end_page = pfn_to_page(end_pfn); 1680 1681 /* This gives a shorter code than deriving page_zone(end_page) */ 1682 if (page_zone_id(start_page) != page_zone_id(end_page)) 1683 return NULL; 1684 1685 return start_page; 1686 } 1687 1688 /* 1689 * The order of subdivision here is critical for the IO subsystem. 1690 * Please do not alter this order without good reasons and regression 1691 * testing. Specifically, as large blocks of memory are subdivided, 1692 * the order in which smaller blocks are delivered depends on the order 1693 * they're subdivided in this function. This is the primary factor 1694 * influencing the order in which pages are delivered to the IO 1695 * subsystem according to empirical testing, and this is also justified 1696 * by considering the behavior of a buddy system containing a single 1697 * large block of memory acted on by a series of small allocations. 1698 * This behavior is a critical factor in sglist merging's success. 1699 * 1700 * -- nyc 1701 */ 1702 static inline unsigned int expand(struct zone *zone, struct page *page, int low, 1703 int high, int migratetype) 1704 { 1705 unsigned int size = 1 << high; 1706 unsigned int nr_added = 0; 1707 1708 while (high > low) { 1709 high--; 1710 size >>= 1; 1711 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 1712 1713 /* 1714 * Mark as guard pages (or page), that will allow to 1715 * merge back to allocator when buddy will be freed. 
1716 * Corresponding page table entries will not be touched, 1717 * pages will stay not present in virtual address space 1718 */ 1719 if (set_page_guard(zone, &page[size], high)) 1720 continue; 1721 1722 __add_to_free_list(&page[size], zone, high, migratetype, false); 1723 set_buddy_order(&page[size], high); 1724 nr_added += size; 1725 } 1726 1727 return nr_added; 1728 } 1729 1730 static __always_inline void page_del_and_expand(struct zone *zone, 1731 struct page *page, int low, 1732 int high, int migratetype) 1733 { 1734 int nr_pages = 1 << high; 1735 1736 __del_page_from_free_list(page, zone, high, migratetype); 1737 nr_pages -= expand(zone, page, low, high, migratetype); 1738 account_freepages(zone, -nr_pages, migratetype); 1739 } 1740 1741 static void check_new_page_bad(struct page *page) 1742 { 1743 if (unlikely(PageHWPoison(page))) { 1744 /* Don't complain about hwpoisoned pages */ 1745 if (PageBuddy(page)) 1746 __ClearPageBuddy(page); 1747 return; 1748 } 1749 1750 bad_page(page, 1751 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); 1752 } 1753 1754 /* 1755 * This page is about to be returned from the page allocator 1756 */ 1757 static bool check_new_page(struct page *page) 1758 { 1759 if (likely(page_expected_state(page, 1760 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 1761 return false; 1762 1763 check_new_page_bad(page); 1764 return true; 1765 } 1766 1767 static inline bool check_new_pages(struct page *page, unsigned int order) 1768 { 1769 if (is_check_pages_enabled()) { 1770 for (int i = 0; i < (1 << order); i++) { 1771 struct page *p = page + i; 1772 1773 if (check_new_page(p)) 1774 return true; 1775 } 1776 } 1777 1778 return false; 1779 } 1780 1781 static inline bool should_skip_kasan_unpoison(gfp_t flags) 1782 { 1783 /* Don't skip if a software KASAN mode is enabled. */ 1784 if (IS_ENABLED(CONFIG_KASAN_GENERIC) || 1785 IS_ENABLED(CONFIG_KASAN_SW_TAGS)) 1786 return false; 1787 1788 /* Skip, if hardware tag-based KASAN is not enabled. 
/*
 * Run the per-allocation hooks on a block of (1 << @order) pages: reset
 * page->private, map/unpoison the pages, handle KASAN unpoisoning and memory
 * initialization, then record owner, page-table-check and allocation-tag
 * state.
 *
 * @page:      first page of the block
 * @order:     order of the block
 * @gfp_flags: allocation flags; __GFP_ZEROTAGS, __GFP_SKIP_KASAN and
 *             __GFP_SKIP_ZERO are consulted here directly or via helpers
 */
inline void post_alloc_hook(struct page *page, unsigned int order,
				gfp_t gfp_flags)
{
	const bool zero_tags = gfp_flags & __GFP_ZEROTAGS;
	/*
	 * Zero on allocation only when init-on-free is off, init-on-alloc
	 * asks for it and should_skip_init() does not veto it.
	 */
	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
			!should_skip_init(gfp_flags);
	int i;

	set_page_private(page, 0);

	arch_alloc_page(page, order);
	debug_pagealloc_map_pages(page, 1 << order);

	/*
	 * Page unpoisoning must happen before memory initialization.
	 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
	 * allocations and the page unpoisoning code will complain.
	 */
	kernel_unpoison_pages(page, 1 << order);

	/*
	 * As memory initialization might be integrated into KASAN,
	 * KASAN unpoisoning and memory initialization code must be
	 * kept together to avoid discrepancies in behavior.
	 */

	/*
	 * Clearing tags can efficiently clear the memory for us as well, if
	 * required.
	 */
	/* NOTE(review): init now holds the helper's return value; presumably
	 * "still needs clearing" - confirm against tag_clear_highpages(). */
	if (zero_tags)
		init = tag_clear_highpages(page, 1 << order, /* clear_pages= */init);

	if (!should_skip_kasan_unpoison(gfp_flags) &&
	    kasan_unpoison_pages(page, order, init)) {
		/* Take note that memory was initialized by KASAN. */
		if (kasan_has_integrated_init())
			init = false;
	} else {
		/*
		 * If memory tags have not been set by KASAN, reset the page
		 * tags to ensure page_address() dereferencing does not fault.
		 */
		for (i = 0; i != 1 << order; ++i)
			page_kasan_tag_reset(page + i);
	}
	/* If memory is still not initialized, initialize it now. */
	if (init)
		clear_highpages_kasan_tagged(page, 1 << order);

	set_page_owner(page, order, gfp_flags);
	page_table_check_alloc(page, order);
	pgalloc_tag_add(page, current, 1 << order);
}

/*
 * Finish setting up a newly allocated block: run post_alloc_hook(), build the
 * compound page when __GFP_COMP was requested for order > 0, and set or clear
 * the pfmemalloc marker depending on ALLOC_NO_WATERMARKS.
 */
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
							unsigned int alloc_flags)
{
	post_alloc_hook(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	/*
	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
	 * allocate the page. The expectation is that the caller is taking
	 * steps that will free more memory. The caller should avoid the page
	 * being used for !PFMEMALLOC purposes.
	 */
	if (alloc_flags & ALLOC_NO_WATERMARKS)
		set_page_pfmemalloc(page);
	else
		clear_page_pfmemalloc(page);
}
This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 *
 * The other migratetypes do not have fallbacks.
 */
static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
};

#ifdef CONFIG_CMA
/* CMA fallback: delegates to __rmqueue_smallest() with MIGRATE_CMA. */
static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order)
{
	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
/* Without CONFIG_CMA there is no CMA freelist; always returns NULL. */
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order) { return NULL; }
#endif

/*
 * Move all free pages of a block to new type's freelist. Caller needs to
 * change the block type.
 *
 * @start_pfn is expected to be pageblock aligned (VM_WARN_ON otherwise).
 * Returns the number of pages (not buddies) that were moved.
 */
static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
				  int old_mt, int new_mt)
{
	struct page *page;
	unsigned long pfn, end_pfn;
	unsigned int order;
	int pages_moved = 0;

	VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1));
	end_pfn = pageblock_end_pfn(start_pfn);

	for (pfn = start_pfn; pfn < end_pfn;) {
		page = pfn_to_page(pfn);
		/* Not a free buddy head: step one pfn at a time. */
		if (!PageBuddy(page)) {
			pfn++;
			continue;
		}

		/* Make sure we are not inadvertently changing nodes */
		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
		VM_BUG_ON_PAGE(page_zone(page) != zone, page);

		order = buddy_order(page);

		move_to_free_list(page, zone, order, old_mt, new_mt);

		/* Skip over the whole buddy that was just moved. */
		pfn += 1 << order;
		pages_moved += 1 << order;
	}

	return pages_moved;
}

/*
 * Locate the pageblock containing @page and optionally count its contents.
 *
 * On success, *@start_pfn is set to the first pfn of the pageblock. If
 * @num_free is non-NULL, *@num_free receives the number of pages that sit
 * in free buddies within the block and *@num_movable the number of pages
 * that are PageLRU or have movable ops. @num_movable is dereferenced
 * whenever @num_free is non-NULL, so the two must be passed together.
 *
 * Returns false, without writing any output, if the first or the last pfn
 * of the pageblock is not spanned by @zone; true otherwise.
 */
static bool prep_move_freepages_block(struct zone *zone, struct page *page,
				      unsigned long *start_pfn,
				      int *num_free, int *num_movable)
{
	unsigned long pfn, start, end;

	pfn = page_to_pfn(page);
	start = pageblock_start_pfn(pfn);
	end = pageblock_end_pfn(pfn);

	/*
	 * The caller only has the lock for @zone, don't touch ranges
	 * that straddle into other zones. While we could move part of
	 * the range that's inside the zone, this call is usually
	 * accompanied by other operations such as migratetype updates
	 * which also should be locked.
	 */
	if (!zone_spans_pfn(zone, start))
		return false;
	if (!zone_spans_pfn(zone, end - 1))
		return false;

	*start_pfn = start;

	if (num_free) {
		*num_free = 0;
		*num_movable = 0;
		for (pfn = start; pfn < end;) {
			page = pfn_to_page(pfn);
			if (PageBuddy(page)) {
				int nr = 1 << buddy_order(page);

				*num_free += nr;
				pfn += nr;
				continue;
			}
			/*
			 * We assume that pages that could be isolated for
			 * migration are movable. But we don't actually try
			 * isolating, as that would be expensive.
			 */
			if (PageLRU(page) || page_has_movable_ops(page))
				(*num_movable)++;
			pfn++;
		}
	}

	return true;
}

/*
 * Move the free pages of @page's pageblock from the @old_mt freelist to the
 * @new_mt freelist and set the pageblock's migratetype to @new_mt.
 *
 * Returns the number of pages moved, or -1 if prep_move_freepages_block()
 * rejected the block (in which case nothing is changed).
 */
static int move_freepages_block(struct zone *zone, struct page *page,
				int old_mt, int new_mt)
{
	unsigned long start_pfn;
	int res;

	if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
		return -1;

	res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
	set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);

	return res;

}

#ifdef CONFIG_MEMORY_ISOLATION
/*
 * Look for a buddy that straddles start_pfn.
 *
 * Returns the pfn of a PageBuddy head whose range extends past @start_pfn,
 * or @start_pfn itself when no such buddy is found.
 */
static unsigned long find_large_buddy(unsigned long start_pfn)
{
	/*
	 * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing
	 * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking
	 * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy,
	 * the starting order does not matter.
	 */
	int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;
	struct page *page;
	unsigned long pfn = start_pfn;

	while (!PageBuddy(page = pfn_to_page(pfn))) {
		/* Nothing found */
		if (++order > MAX_PAGE_ORDER)
			return start_pfn;
		/* Round down to the candidate head of the next larger order. */
		pfn &= ~0UL << order;
	}

	/*
	 * Found a preceding buddy, but does it straddle?
	 */
	if (pfn + (1 << buddy_order(page)) > start_pfn)
		return pfn;

	/* Nothing found */
	return start_pfn;
}

/* Set (@isolate true) or clear (@isolate false) the isolate state of @page's pageblock. */
static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
{
	if (isolate)
		set_pageblock_isolate(page);
	else
		clear_pageblock_isolate(page);
}

/**
 * __move_freepages_block_isolate - move free pages in block for page isolation
 * @zone: the zone
 * @page: the pageblock page
 * @isolate: to isolate the given pageblock or unisolate it
 *
 * This is similar to move_freepages_block(), but handles the special
 * case encountered in page isolation, where the block of interest
 * might be part of a larger buddy spanning multiple pageblocks.
 *
 * Unlike the regular page allocator path, which moves pages while
 * stealing buddies off the freelist, page isolation is interested in
 * arbitrary pfn ranges that may have overlapping buddies on both ends.
 *
 * This function handles that. Straddling buddies are split into
 * individual pageblocks. Only the block of interest is moved.
 *
 * Returns %true if pages could be moved, %false otherwise.
2098 */ 2099 static bool __move_freepages_block_isolate(struct zone *zone, 2100 struct page *page, bool isolate) 2101 { 2102 unsigned long start_pfn, buddy_pfn; 2103 int from_mt; 2104 int to_mt; 2105 struct page *buddy; 2106 2107 if (isolate == get_pageblock_isolate(page)) { 2108 VM_WARN_ONCE(1, "%s a pageblock that is already in that state", 2109 isolate ? "Isolate" : "Unisolate"); 2110 return false; 2111 } 2112 2113 if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) 2114 return false; 2115 2116 /* No splits needed if buddies can't span multiple blocks */ 2117 if (pageblock_order == MAX_PAGE_ORDER) 2118 goto move; 2119 2120 buddy_pfn = find_large_buddy(start_pfn); 2121 buddy = pfn_to_page(buddy_pfn); 2122 /* We're a part of a larger buddy */ 2123 if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) { 2124 int order = buddy_order(buddy); 2125 2126 del_page_from_free_list(buddy, zone, order, 2127 get_pfnblock_migratetype(buddy, buddy_pfn)); 2128 toggle_pageblock_isolate(page, isolate); 2129 split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE); 2130 return true; 2131 } 2132 2133 move: 2134 /* Use PAGEBLOCK_MIGRATETYPE_MASK to get non-isolate migratetype */ 2135 if (isolate) { 2136 from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), 2137 PAGEBLOCK_MIGRATETYPE_MASK); 2138 to_mt = MIGRATE_ISOLATE; 2139 } else { 2140 from_mt = MIGRATE_ISOLATE; 2141 to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), 2142 PAGEBLOCK_MIGRATETYPE_MASK); 2143 } 2144 2145 __move_freepages_block(zone, start_pfn, from_mt, to_mt); 2146 toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate); 2147 2148 return true; 2149 } 2150 2151 bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page) 2152 { 2153 return __move_freepages_block_isolate(zone, page, true); 2154 } 2155 2156 bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page) 2157 { 2158 return __move_freepages_block_isolate(zone, page, false); 
2159 } 2160 2161 #endif /* CONFIG_MEMORY_ISOLATION */ 2162 2163 static inline bool boost_watermark(struct zone *zone) 2164 { 2165 unsigned long max_boost; 2166 2167 if (!watermark_boost_factor) 2168 return false; 2169 /* 2170 * Don't bother in zones that are unlikely to produce results. 2171 * On small machines, including kdump capture kernels running 2172 * in a small area, boosting the watermark can cause an out of 2173 * memory situation immediately. 2174 */ 2175 if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) 2176 return false; 2177 2178 max_boost = mult_frac(zone->_watermark[WMARK_HIGH], 2179 watermark_boost_factor, 10000); 2180 2181 /* 2182 * high watermark may be uninitialised if fragmentation occurs 2183 * very early in boot so do not boost. We do not fall 2184 * through and boost by pageblock_nr_pages as failing 2185 * allocations that early means that reclaim is not going 2186 * to help and it may even be impossible to reclaim the 2187 * boosted watermark resulting in a hang. 2188 */ 2189 if (!max_boost) 2190 return false; 2191 2192 max_boost = max(pageblock_nr_pages, max_boost); 2193 2194 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, 2195 max_boost); 2196 2197 return true; 2198 } 2199 2200 /* 2201 * When we are falling back to another migratetype during allocation, should we 2202 * try to claim an entire block to satisfy further allocations, instead of 2203 * polluting multiple pageblocks? 2204 */ 2205 static bool should_try_claim_block(unsigned int order, int start_mt) 2206 { 2207 /* 2208 * Leaving this order check is intended, although there is 2209 * relaxed order check in next check. The reason is that 2210 * we can actually claim the whole pageblock if this condition met, 2211 * but, below check doesn't guarantee it and that is just heuristic 2212 * so could be changed anytime. 
2213 */ 2214 if (order >= pageblock_order) 2215 return true; 2216 2217 /* 2218 * Above a certain threshold, always try to claim, as it's likely there 2219 * will be more free pages in the pageblock. 2220 */ 2221 if (order >= pageblock_order / 2) 2222 return true; 2223 2224 /* 2225 * Unmovable/reclaimable allocations would cause permanent 2226 * fragmentations if they fell back to allocating from a movable block 2227 * (polluting it), so we try to claim the whole block regardless of the 2228 * allocation size. Later movable allocations can always steal from this 2229 * block, which is less problematic. 2230 */ 2231 if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE) 2232 return true; 2233 2234 if (page_group_by_mobility_disabled) 2235 return true; 2236 2237 /* 2238 * Movable pages won't cause permanent fragmentation, so when you alloc 2239 * small pages, we just need to temporarily steal unmovable or 2240 * reclaimable pages that are closest to the request size. After a 2241 * while, memory compaction may occur to form large contiguous pages, 2242 * and the next movable allocation may not need to steal. 2243 */ 2244 return false; 2245 } 2246 2247 /* 2248 * Check whether there is a suitable fallback freepage with requested order. 2249 * If claimable is true, this function returns fallback_mt only if 2250 * we would do this whole-block claiming. This would help to reduce 2251 * fragmentation due to mixed migratetype pages in one pageblock. 
2252 */ 2253 enum fallback_result 2254 find_suitable_fallback(struct free_area *area, unsigned int order, 2255 int migratetype, bool claimable, int *mt_out) 2256 { 2257 int i; 2258 2259 if (claimable && !should_try_claim_block(order, migratetype)) 2260 return FALLBACK_NOCLAIM; 2261 2262 if (area->nr_free == 0) 2263 return FALLBACK_EMPTY; 2264 2265 for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { 2266 int fallback_mt = fallbacks[migratetype][i]; 2267 2268 if (!free_area_empty(area, fallback_mt)) { 2269 if (mt_out) 2270 *mt_out = fallback_mt; 2271 return FALLBACK_FOUND; 2272 } 2273 } 2274 2275 return FALLBACK_EMPTY; 2276 } 2277 2278 /* 2279 * This function implements actual block claiming behaviour. If order is large 2280 * enough, we can claim the whole pageblock for the requested migratetype. If 2281 * not, we check the pageblock for constituent pages; if at least half of the 2282 * pages are free or compatible, we can still claim the whole block, so pages 2283 * freed in the future will be put on the correct free list. 2284 */ 2285 static struct page * 2286 try_to_claim_block(struct zone *zone, struct page *page, 2287 int current_order, int order, int start_type, 2288 int block_type, unsigned int alloc_flags) 2289 { 2290 int free_pages, movable_pages, alike_pages; 2291 unsigned long start_pfn; 2292 2293 /* Take ownership for orders >= pageblock_order */ 2294 if (current_order >= pageblock_order) { 2295 unsigned int nr_added; 2296 2297 del_page_from_free_list(page, zone, current_order, block_type); 2298 change_pageblock_range(page, current_order, start_type); 2299 nr_added = expand(zone, page, order, current_order, start_type); 2300 account_freepages(zone, nr_added, start_type); 2301 return page; 2302 } 2303 2304 /* 2305 * Boost watermarks to increase reclaim pressure to reduce the 2306 * likelihood of future fallbacks. Wake kswapd now as the node 2307 * may be balanced overall and kswapd will not wake naturally. 
2308 */ 2309 if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) 2310 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 2311 2312 /* moving whole block can fail due to zone boundary conditions */ 2313 if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, 2314 &movable_pages)) 2315 return NULL; 2316 2317 /* 2318 * Determine how many pages are compatible with our allocation. 2319 * For movable allocation, it's the number of movable pages which 2320 * we just obtained. For other types it's a bit more tricky. 2321 */ 2322 if (start_type == MIGRATE_MOVABLE) { 2323 alike_pages = movable_pages; 2324 } else { 2325 /* 2326 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation 2327 * to MOVABLE pageblock, consider all non-movable pages as 2328 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or 2329 * vice versa, be conservative since we can't distinguish the 2330 * exact migratetype of non-movable pages. 2331 */ 2332 if (block_type == MIGRATE_MOVABLE) 2333 alike_pages = pageblock_nr_pages 2334 - (free_pages + movable_pages); 2335 else 2336 alike_pages = 0; 2337 } 2338 /* 2339 * If a sufficient number of pages in the block are either free or of 2340 * compatible migratability as our allocation, claim the whole block. 2341 */ 2342 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 2343 page_group_by_mobility_disabled) { 2344 __move_freepages_block(zone, start_pfn, block_type, start_type); 2345 set_pageblock_migratetype(pfn_to_page(start_pfn), start_type); 2346 return __rmqueue_smallest(zone, order, start_type); 2347 } 2348 2349 return NULL; 2350 } 2351 2352 /* 2353 * Try to allocate from some fallback migratetype by claiming the entire block, 2354 * i.e. converting it to the allocation's start migratetype. 2355 * 2356 * The use of signed ints for order and current_order is a deliberate 2357 * deviation from the rest of this file, to make the for loop 2358 * condition simpler. 
2359 */ 2360 static __always_inline struct page * 2361 __rmqueue_claim(struct zone *zone, int order, int start_migratetype, 2362 unsigned int alloc_flags) 2363 { 2364 struct free_area *area; 2365 int current_order; 2366 int min_order = order; 2367 struct page *page; 2368 int fallback_mt; 2369 2370 /* 2371 * Do not steal pages from freelists belonging to other pageblocks 2372 * i.e. orders < pageblock_order. If there are no local zones free, 2373 * the zonelists will be reiterated without ALLOC_NOFRAGMENT. 2374 */ 2375 if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) 2376 min_order = pageblock_order; 2377 2378 /* 2379 * Find the largest available free page in the other list. This roughly 2380 * approximates finding the pageblock with the most free pages, which 2381 * would be too costly to do exactly. 2382 */ 2383 for (current_order = MAX_PAGE_ORDER; current_order >= min_order; 2384 --current_order) { 2385 enum fallback_result result; 2386 2387 area = &(zone->free_area[current_order]); 2388 result = find_suitable_fallback(area, current_order, 2389 start_migratetype, true, &fallback_mt); 2390 2391 if (result == FALLBACK_EMPTY) 2392 continue; 2393 2394 if (result == FALLBACK_NOCLAIM) 2395 break; 2396 2397 page = get_page_from_free_area(area, fallback_mt); 2398 page = try_to_claim_block(zone, page, current_order, order, 2399 start_migratetype, fallback_mt, 2400 alloc_flags); 2401 if (page) { 2402 trace_mm_page_alloc_extfrag(page, order, current_order, 2403 start_migratetype, fallback_mt); 2404 return page; 2405 } 2406 } 2407 2408 return NULL; 2409 } 2410 2411 /* 2412 * Try to steal a single page from some fallback migratetype. Leave the rest of 2413 * the block as its current migratetype, potentially causing fragmentation. 
*/
static __always_inline struct page *
__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
{
	struct free_area *area;
	int current_order;
	struct page *page;
	int fallback_mt;

	/* Walk upwards from the requested order: smallest usable buddy first. */
	for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
		enum fallback_result result;

		area = &(zone->free_area[current_order]);
		/* claimable == false, so only FOUND or EMPTY can come back. */
		result = find_suitable_fallback(area, current_order, start_migratetype,
						false, &fallback_mt);
		if (result == FALLBACK_EMPTY)
			continue;

		page = get_page_from_free_area(area, fallback_mt);
		page_del_and_expand(zone, page, order, current_order, fallback_mt);
		trace_mm_page_alloc_extfrag(page, order, current_order,
					    start_migratetype, fallback_mt);
		return page;
	}

	return NULL;
}

/*
 * Stage at which __rmqueue() starts its search. __rmqueue() updates it to
 * remember where the last page was found.
 */
enum rmqueue_mode {
	RMQUEUE_NORMAL,
	RMQUEUE_CMA,
	RMQUEUE_CLAIM,
	RMQUEUE_STEAL,
};

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 *
 * @mode is both input and output: the search starts at the stage *@mode
 * names and *@mode is updated when a later stage yields a page.
 * Returns the page, or NULL if every permitted stage came up empty.
 */
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
						unsigned int alloc_flags, enum rmqueue_mode *mode)
{
	struct page *page;

	if (IS_ENABLED(CONFIG_CMA)) {
		/*
		 * Balance movable allocations between regular and CMA areas by
		 * allocating from CMA when over half of the zone's free memory
		 * is in the CMA area.
		 */
		if (alloc_flags & ALLOC_CMA &&
		    zone_page_state(zone, NR_FREE_CMA_PAGES) >
		    zone_page_state(zone, NR_FREE_PAGES) / 2) {
			page = __rmqueue_cma_fallback(zone, order);
			if (page)
				return page;
		}
	}

	/*
	 * First try the freelists of the requested migratetype, then try
	 * fallback modes with increasing levels of fragmentation risk.
	 *
	 * The fallback logic is expensive and rmqueue_bulk() calls in
	 * a loop with the zone->lock held, meaning the freelists are
	 * not subject to any outside changes. Remember in *mode where
	 * we found pay dirt, to save us the search on the next call.
	 */
	switch (*mode) {
	case RMQUEUE_NORMAL:
		page = __rmqueue_smallest(zone, order, migratetype);
		if (page)
			return page;
		fallthrough;
	case RMQUEUE_CMA:
		if (alloc_flags & ALLOC_CMA) {
			page = __rmqueue_cma_fallback(zone, order);
			if (page) {
				*mode = RMQUEUE_CMA;
				return page;
			}
		}
		fallthrough;
	case RMQUEUE_CLAIM:
		page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
		if (page) {
			/* Replenished preferred freelist, back to normal mode. */
			*mode = RMQUEUE_NORMAL;
			return page;
		}
		fallthrough;
	case RMQUEUE_STEAL:
		/* Stealing single pages fragments blocks; skip if forbidden. */
		if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
			page = __rmqueue_steal(zone, order, migratetype);
			if (page) {
				*mode = RMQUEUE_STEAL;
				return page;
			}
		}
	}
	return NULL;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency. Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, unsigned int alloc_flags)
{
	enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
	unsigned long flags;
	int i;

	/* With ALLOC_TRYLOCK, never wait for zone->lock: give up with 0 pages. */
	if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
		if (!spin_trylock_irqsave(&zone->lock, flags))
			return 0;
	} else {
		spin_lock_irqsave(&zone->lock, flags);
	}
	for (i = 0; i < count; ++i) {
		/* rmqm carries the successful stage over to the next iteration. */
		struct page *page = __rmqueue(zone, order, migratetype,
								alloc_flags, &rmqm);
		if (unlikely(page == NULL))
			break;

		/*
		 * Split buddy pages returned by expand() are received here in
		 * physical page order. The page is added to the tail of
		 * caller's list. From the caller's perspective, the linked list
		 * is ordered by page number under some conditions. This is
		 * useful for IO devices that can walk the list forward from
		 * the head, and thus in physical page order, and for IO
		 * devices that can merge IO requests if the physical pages
		 * are ordered properly.
		 */
		list_add_tail(&page->pcp_list, list);
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return i;
}

/*
 * Called from the vmstat counter updater to decay the PCP high.
 * Return whether there is additional work to do.
 */
bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
	int high_min, to_drain, to_drain_batched, batch;
	bool todo = false;

	high_min = READ_ONCE(pcp->high_min);
	batch = READ_ONCE(pcp->batch);
	/*
	 * Decrease pcp->high periodically to try to free possible
	 * idle PCP pages.  And, avoid to free too many pages to
	 * control latency.  This caps pcp->high decrement too.
	 */
	if (pcp->high > high_min) {
		pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
				 pcp->high - (pcp->high >> 3), high_min);
		if (pcp->high > high_min)
			todo = true;
	}

	/* Free whatever now exceeds pcp->high, at most @batch pages per lock hold. */
	to_drain = pcp->count - pcp->high;
	while (to_drain > 0) {
		to_drain_batched = min(to_drain, batch);
		pcp_spin_lock_nopin(pcp);
		free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
		pcp_spin_unlock_nopin(pcp);
		todo = true;

		to_drain -= to_drain_batched;
	}

	return todo;
}

#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Frees at most pcp->batch pages per call.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	int to_drain, batch;

	batch = READ_ONCE(pcp->batch);
	to_drain = min(pcp->count, batch);
	if (to_drain > 0) {
		pcp_spin_lock_nopin(pcp);
		free_pcppages_bulk(zone, to_drain, pcp, 0);
		pcp_spin_unlock_nopin(pcp);
	}
}
#endif

/*
 * Drain pcplists of the indicated processor and zone.
 *
 * The list is emptied in chunks of at most
 * pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX pages, dropping the pcp lock
 * between chunks.
 */
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
	struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
	int count;

	do {
		pcp_spin_lock_nopin(pcp);
		count = pcp->count;
		if (count) {
			int to_drain = min(count,
				pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);

			free_pcppages_bulk(zone, to_drain, pcp, 0);
			/* Non-zero remainder means another pass is needed. */
			count -= to_drain;
		}
		pcp_spin_unlock_nopin(pcp);
	} while (count);
}

/*
 * Drain pcplists of all zones on the indicated processor.
*/
static void drain_pages(unsigned int cpu)
{
	struct zone *zone;

	for_each_populated_zone(zone) {
		drain_pages_zone(cpu, zone);
	}
}

/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 *
 * With a non-NULL @zone only that zone's pcplist is drained, otherwise the
 * pcplists of all populated zones are.
 */
void drain_local_pages(struct zone *zone)
{
	int cpu = smp_processor_id();

	if (zone)
		drain_pages_zone(cpu, zone);
	else
		drain_pages(cpu);
}

/*
 * The implementation of drain_all_pages(), exposing an extra parameter to
 * drain on all cpus.
 *
 * drain_all_pages() is optimized to only execute on cpus where pcplists are
 * not empty. The check for non-emptiness can however race with a free to
 * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
 * that need the guarantee that every CPU has drained can disable the
 * optimizing racy check.
 */
static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
{
	int cpu;

	/*
	 * Allocate in the BSS so we won't require allocation in
	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
	 */
	static cpumask_t cpus_with_pcps;

	/*
	 * Do not drain if one is already in progress unless it's specific to
	 * a zone. Such callers are primarily CMA and memory hotplug and need
	 * the drain to be complete when the call returns.
	 */
	if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
		if (!zone)
			return;
		mutex_lock(&pcpu_drain_mutex);
	}

	/*
	 * We don't care about racing with CPU hotplug event
	 * as offline notification will cause the notified
	 * cpu to drain that CPU pcps and on_each_cpu_mask
	 * disables preemption as part of its processing
	 */
	for_each_online_cpu(cpu) {
		struct per_cpu_pages *pcp;
		struct zone *z;
		bool has_pcps = false;

		if (force_all_cpus) {
			/*
			 * The pcp.count check is racy, some callers need a
			 * guarantee that no cpu is missed.
			 */
			has_pcps = true;
		} else if (zone) {
			pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
			if (pcp->count)
				has_pcps = true;
		} else {
			for_each_populated_zone(z) {
				pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
				if (pcp->count) {
					has_pcps = true;
					break;
				}
			}
		}

		/* The mask is static, so stale bits must be cleared explicitly. */
		if (has_pcps)
			cpumask_set_cpu(cpu, &cpus_with_pcps);
		else
			cpumask_clear_cpu(cpu, &cpus_with_pcps);
	}

	for_each_cpu(cpu, &cpus_with_pcps) {
		if (zone)
			drain_pages_zone(cpu, zone);
		else
			drain_pages(cpu);
	}

	mutex_unlock(&pcpu_drain_mutex);
}

/*
 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
 *
 * When zone parameter is non-NULL, spill just the single zone's pages.
 */
void drain_all_pages(struct zone *zone)
{
	__drain_all_pages(zone, false);
}

/*
 * Compute how many pages to free from @pcp in one go.
 *
 * @batch and @high are the values the caller sampled; @free_high requests
 * the aggressive path used when batch freeing high-order pages.
 */
static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
{
	int min_nr_free, max_nr_free;

	/* Free as much as possible if batch freeing high-order pages. */
	if (unlikely(free_high))
		return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);

	/* Check for PCP disabled or boot pageset */
	if (unlikely(high < batch))
		return 1;

	/* Leave at least pcp->batch pages on the list */
	min_nr_free = batch;
	max_nr_free = high - batch;

	/*
	 * Increase the batch number to the number of the consecutive
	 * freed pages to reduce zone lock contention.
	 */
	batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);

	return batch;
}

/*
 * Compute the pcp->count threshold at which the caller starts freeing
 * pages, updating pcp->high as a side effect.
 *
 * pcp->high is first clamped to [high_min, high_max]; depending on
 * @free_high and the zone's ZONE_RECLAIM_ACTIVE / ZONE_BELOW_HIGH flags it
 * may then be lowered or raised again within those limits.
 */
static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
		       int batch, bool free_high)
{
	int high, high_min, high_max;

	high_min = READ_ONCE(pcp->high_min);
	high_max = READ_ONCE(pcp->high_max);
	high = pcp->high = clamp(pcp->high, high_min, high_max);

	if (unlikely(!high))
		return 0;

	/* Shrink pcp->high and return 0 so that any count reaches the threshold. */
	if (unlikely(free_high)) {
		pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
				high_min);
		return 0;
	}

	/*
	 * If reclaim is active, limit the number of pages that can be
	 * stored on pcp lists
	 */
	if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
		int free_count = max_t(int, pcp->free_count, batch);

		pcp->high = max(high - free_count, high_min);
		return min(batch << 2, pcp->high);
	}

	/* No room to auto-tune when the limits coincide. */
	if (high_min == high_max)
		return high;

	if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
		int free_count = max_t(int, pcp->free_count, batch);

		pcp->high = max(high - free_count, high_min);
		high = max(pcp->count, high_min);
	} else if (pcp->count >= high) {
		int need_high = pcp->free_count + batch;

		/* pcp->high should be large enough to hold batch freed pages */
		if (pcp->high < need_high)
			pcp->high = clamp(need_high, high_min, high_max);
	}

	return high;
}

/*
 * Tune pcp alloc factor and adjust count &
free_count. Free pages to bring the
 * pcp's watermarks below high.
 *
 * May return with the pcp unlocked: while freeing in batches the pcp
 * spinlock is dropped and retaken, and if it cannot be reacquired (or the
 * task has moved to another CPU in the meantime) the function gives up.
 * Return true if pcp is locked, false otherwise.
 */
static bool free_frozen_page_commit(struct zone *zone,
		struct per_cpu_pages *pcp, struct page *page, int migratetype,
		unsigned int order, fpi_t fpi_flags)
{
	int high, batch;
	int to_free, to_free_batched;
	int pindex;
	int cpu = smp_processor_id();
	int ret = true;
	bool free_high = false;

	/*
	 * On freeing, reduce the number of pages that are batch allocated.
	 * See nr_pcp_alloc() where alloc_factor is increased for subsequent
	 * allocations.
	 */
	pcp->alloc_factor >>= 1;
	__count_vm_events(PGFREE, 1 << order);
	pindex = order_to_pindex(migratetype, order);
	list_add(&page->pcp_list, &pcp->lists[pindex]);
	pcp->count += 1 << order;

	batch = READ_ONCE(pcp->batch);
	/*
	 * As high-order pages other than THP's stored on PCP can contribute
	 * to fragmentation, limit the number stored when PCP is heavily
	 * freeing without allocation. The remainder after bulk freeing
	 * stops will be drained from vmstat refresh context.
	 */
	if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
		free_high = (pcp->free_count >= (batch + pcp->high_min / 2) &&
			     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
			     (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
			      pcp->count >= batch));
		pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
	} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
		pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
	}
	/* free_count stops growing at batch << CONFIG_PCP_BATCH_SCALE_MAX. */
	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
		pcp->free_count += (1 << order);

	if (unlikely(fpi_flags & FPI_TRYLOCK)) {
		/*
		 * Do not attempt to take a zone lock. Let pcp->count get
		 * over high mark temporarily.
		 */
		return true;
	}

	high = nr_pcp_high(pcp, zone, batch, free_high);
	if (pcp->count < high)
		return true;

	to_free = nr_pcp_free(pcp, batch, high, free_high);
	while (to_free > 0 && pcp->count > 0) {
		to_free_batched = min(to_free, batch);
		free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
		to_free -= to_free_batched;

		if (to_free == 0 || pcp->count == 0)
			break;

		/* Release and retake the pcp lock between batches. */
		pcp_spin_unlock(pcp);

		pcp = pcp_spin_trylock(zone->per_cpu_pageset);
		if (!pcp) {
			ret = false;
			break;
		}

		/*
		 * Check if this thread has been migrated to a different CPU.
		 * If that is the case, give up and indicate that the pcp is
		 * returned in an unlocked state.
		 */
		if (smp_processor_id() != cpu) {
			pcp_spin_unlock(pcp);
			ret = false;
			break;
		}
	}

	if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
	    zone_watermark_ok(zone, 0, high_wmark_pages(zone),
			      ZONE_MOVABLE, 0)) {
		struct pglist_data *pgdat = zone->zone_pgdat;
		clear_bit(ZONE_BELOW_HIGH, &zone->flags);

		/*
		 * Assume that memory pressure on this node is gone and may be
		 * in a reclaimable state. If a memory fallback node exists,
		 * direct reclaim may not have been triggered, causing a
		 * 'hopeless node' to stay in that state for a while.  Let
		 * kswapd work again by resetting kswapd_failures.
		 */
		if (kswapd_test_hopeless(pgdat) &&
		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
	}
	return ret;
}

/*
 * Free a pcp page
 *
 * Orders that are not allowed on the pcplists go straight to
 * __free_pages_ok(). Everything else is prepared and then either queued on
 * this CPU's pcplist or, if that is not possible, freed directly into the
 * zone with free_one_page().
 */
static void __free_frozen_pages(struct page *page, unsigned int order,
				fpi_t fpi_flags)
{
	struct per_cpu_pages *pcp;
	struct zone *zone;
	unsigned long pfn = page_to_pfn(page);
	int migratetype;

	if (!pcp_allowed_order(order)) {
		__free_pages_ok(page, order, fpi_flags);
		return;
	}

	if (!__free_pages_prepare(page, order, fpi_flags))
		return;

	/*
	 * We only track unmovable, reclaimable and movable on pcp lists.
	 * Place ISOLATE pages on the isolated list because they are being
	 * offlined but treat HIGHATOMIC and CMA as movable pages so we can
	 * get those areas back if necessary. Otherwise, we may have to free
	 * excessively into the page allocator
	 */
	zone = page_zone(page);
	migratetype = get_pfnblock_migratetype(page, pfn);
	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
		if (unlikely(is_migrate_isolate(migratetype))) {
			free_one_page(zone, page, pfn, order, fpi_flags);
			return;
		}
		migratetype = MIGRATE_MOVABLE;
	}

	/* Trylock frees from NMI/hardirq on PREEMPT_RT are deferred via the zone llist. */
	if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
		     && (in_nmi() || in_hardirq()))) {
		add_page_to_zone_llist(zone, page, order);
		return;
	}
	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
	if (pcp) {
		/* A false return means the pcp lock has already been dropped. */
		if (!free_frozen_page_commit(zone, pcp, page, migratetype,
						order, fpi_flags))
			return;
		pcp_spin_unlock(pcp);
	} else {
		free_one_page(zone, page, pfn, order, fpi_flags);
	}
}

/* Free a frozen page of the given order with no special FPI flags. */
void free_frozen_pages(struct page *page, unsigned int order)
{
	__free_frozen_pages(page, order, FPI_NONE);
}

/* Variant of free_frozen_pages() that frees with FPI_TRYLOCK. */
void free_frozen_pages_nolock(struct page *page, unsigned int order)
{
__free_frozen_pages(page, order, FPI_TRYLOCK);
}

/*
 * Free a batch of folios
 *
 * The batch is processed in two passes: the first prepares every folio and
 * filters out those that do not go through the pcplists, the second queues
 * the survivors on the pcplists, keeping a zone's pcp lock across
 * consecutive folios of the same zone. The batch is reinitialised on return.
 */
void free_unref_folios(struct folio_batch *folios)
{
	struct per_cpu_pages *pcp = NULL;
	struct zone *locked_zone = NULL;
	int i, j;

	/* Prepare folios for freeing */
	for (i = 0, j = 0; i < folios->nr; i++) {
		struct folio *folio = folios->folios[i];
		unsigned long pfn = folio_pfn(folio);
		unsigned int order = folio_order(folio);

		if (!__free_pages_prepare(&folio->page, order, FPI_NONE))
			continue;
		/*
		 * Free orders not handled on the PCP directly to the
		 * allocator.
		 */
		if (!pcp_allowed_order(order)) {
			free_one_page(folio_zone(folio), &folio->page,
				      pfn, order, FPI_NONE);
			continue;
		}
		/* Stash the order in ->private for the second pass. */
		folio->private = (void *)(unsigned long)order;
		/* Compact the surviving folios to the front of the batch. */
		if (j != i)
			folios->folios[j] = folio;
		j++;
	}
	folios->nr = j;

	for (i = 0; i < folios->nr; i++) {
		struct folio *folio = folios->folios[i];
		struct zone *zone = folio_zone(folio);
		unsigned long pfn = folio_pfn(folio);
		unsigned int order = (unsigned long)folio->private;
		int migratetype;

		folio->private = NULL;
		migratetype = get_pfnblock_migratetype(&folio->page, pfn);

		/* Different zone requires a different pcp lock */
		if (zone != locked_zone ||
		    is_migrate_isolate(migratetype)) {
			if (pcp) {
				pcp_spin_unlock(pcp);
				locked_zone = NULL;
				pcp = NULL;
			}

			/*
			 * Free isolated pages directly to the
			 * allocator, see comment in __free_frozen_pages.
			 */
			if (is_migrate_isolate(migratetype)) {
				free_one_page(zone, &folio->page, pfn,
					      order, FPI_NONE);
				continue;
			}

			/*
			 * trylock is necessary as folios may be getting freed
			 * from IRQ or SoftIRQ context after an IO completion.
			 */
			pcp = pcp_spin_trylock(zone->per_cpu_pageset);
			if (unlikely(!pcp)) {
				free_one_page(zone, &folio->page, pfn,
					      order, FPI_NONE);
				continue;
			}
			locked_zone = zone;
		}

		/*
		 * Non-isolated types over MIGRATE_PCPTYPES get added
		 * to the MIGRATE_MOVABLE pcp list.
		 */
		if (unlikely(migratetype >= MIGRATE_PCPTYPES))
			migratetype = MIGRATE_MOVABLE;

		trace_mm_page_free_batched(&folio->page);
		/* On false the pcp lock is already gone; forget it. */
		if (!free_frozen_page_commit(zone, pcp, &folio->page,
				migratetype, order, FPI_NONE)) {
			pcp = NULL;
			locked_zone = NULL;
		}
	}

	if (pcp)
		pcp_spin_unlock(pcp);
	folio_batch_reinit(folios);
}

/* Split the page owner, allocation tag and memcg bookkeeping of @page. */
static void __split_page(struct page *page, unsigned int order)
{
	VM_WARN_ON_PAGE(PageCompound(page), page);

	split_page_owner(page, order, 0);
	pgalloc_tag_split(page_folio(page), order, 0);
	split_page_memcg(page, order);
}

/*
 * split_page takes a non-compound higher-order page, and splits it into
 * n (1<<order) sub-pages: page[0..n-1]
 * Each sub-page must be freed individually.
 *
 * Note: this is probably too low level an operation for use in drivers.
 * Please consult with lkml before using this in your driver.
 */
void split_page(struct page *page, unsigned int order)
{
	int i;

	VM_WARN_ON_PAGE(!page_count(page), page);

	/* page[0] is skipped here; only the tail sub-pages are set refcounted. */
	for (i = 1; i < (1 << order); i++)
		set_page_refcounted(page + i);

	__split_page(page, order);
}
EXPORT_SYMBOL_GPL(split_page);

/*
 * Take a free page of the given order off its freelist.
 *
 * Returns the number of pages isolated (1 << @order), or 0 if the
 * watermark check refused the isolation.
 */
int __isolate_free_page(struct page *page, unsigned int order)
{
	struct zone *zone = page_zone(page);
	int mt = get_pageblock_migratetype(page);

	if (!is_migrate_isolate(mt)) {
		unsigned long watermark;
		/*
		 * Obey watermarks as if the page was being allocated. We can
		 * emulate a high-order watermark check with a raised order-0
		 * watermark, because we already know our high-order page
		 * exists.
		 */
		watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
		if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
			return 0;
	}

	del_page_from_free_list(page, zone, order, mt);

	/*
	 * Set the pageblock if the isolated page is at least half of a
	 * pageblock
	 */
	if (order >= pageblock_order - 1) {
		struct page *endpage = page + (1 << order) - 1;
		for (; page < endpage; page += pageblock_nr_pages) {
			/* Per-pageblock type; shadows the outer mt on purpose. */
			int mt = get_pageblock_migratetype(page);
			/*
			 * Only change normal pageblocks (i.e., they can merge
			 * with others)
			 */
			if (migratetype_is_mergeable(mt))
				move_freepages_block(zone, page, mt,
						     MIGRATE_MOVABLE);
		}
	}

	return 1UL << order;
}

/**
 * __putback_isolated_page - Return a now-isolated page back where we got it
 * @page: Page that was isolated
 * @order: Order of the isolated page
 * @mt: The page's pageblock's migratetype
 *
 * This function is meant to return a page pulled from the free lists via
 * __isolate_free_page back to the free lists they were pulled from.
 */
void __putback_isolated_page(struct page *page, unsigned int order, int mt)
{
	struct zone *zone = page_zone(page);

	/* zone lock should be held when this function is called */
	lockdep_assert_held(&zone->lock);

	/* Return isolated page to tail of freelist.
*/ 3179 __free_one_page(page, page_to_pfn(page), zone, order, mt, 3180 FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); 3181 } 3182 3183 /* 3184 * Update NUMA hit/miss statistics 3185 */ 3186 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, 3187 long nr_account) 3188 { 3189 #ifdef CONFIG_NUMA 3190 enum numa_stat_item local_stat = NUMA_LOCAL; 3191 3192 /* skip numa counters update if numa stats is disabled */ 3193 if (!static_branch_likely(&vm_numa_stat_key)) 3194 return; 3195 3196 if (zone_to_nid(z) != numa_node_id()) 3197 local_stat = NUMA_OTHER; 3198 3199 if (zone_to_nid(z) == zone_to_nid(preferred_zone)) 3200 __count_numa_events(z, NUMA_HIT, nr_account); 3201 else { 3202 __count_numa_events(z, NUMA_MISS, nr_account); 3203 __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); 3204 } 3205 __count_numa_events(z, local_stat, nr_account); 3206 #endif 3207 } 3208 3209 static __always_inline 3210 struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, 3211 unsigned int order, unsigned int alloc_flags, 3212 int migratetype) 3213 { 3214 struct page *page; 3215 unsigned long flags; 3216 3217 do { 3218 page = NULL; 3219 if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { 3220 if (!spin_trylock_irqsave(&zone->lock, flags)) 3221 return NULL; 3222 } else { 3223 spin_lock_irqsave(&zone->lock, flags); 3224 } 3225 if (alloc_flags & ALLOC_HIGHATOMIC) 3226 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 3227 if (!page) { 3228 enum rmqueue_mode rmqm = RMQUEUE_NORMAL; 3229 3230 page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm); 3231 3232 /* 3233 * If the allocation fails, allow OOM handling and 3234 * order-0 (atomic) allocs access to HIGHATOMIC 3235 * reserves as failing now is worse than failing a 3236 * high-order atomic allocation in the future. 
3237 */ 3238 if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK))) 3239 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 3240 3241 if (!page) { 3242 spin_unlock_irqrestore(&zone->lock, flags); 3243 return NULL; 3244 } 3245 } 3246 spin_unlock_irqrestore(&zone->lock, flags); 3247 } while (check_new_pages(page, order)); 3248 3249 /* 3250 * If this is a high-order atomic allocation then check 3251 * if the pageblock should be reserved for the future 3252 */ 3253 if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) 3254 reserve_highatomic_pageblock(page, order, zone); 3255 3256 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 3257 zone_statistics(preferred_zone, zone, 1); 3258 3259 return page; 3260 } 3261 3262 static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) 3263 { 3264 int high, base_batch, batch, max_nr_alloc; 3265 int high_max, high_min; 3266 3267 base_batch = READ_ONCE(pcp->batch); 3268 high_min = READ_ONCE(pcp->high_min); 3269 high_max = READ_ONCE(pcp->high_max); 3270 high = pcp->high = clamp(pcp->high, high_min, high_max); 3271 3272 /* Check for PCP disabled or boot pageset */ 3273 if (unlikely(high < base_batch)) 3274 return 1; 3275 3276 if (order) 3277 batch = base_batch; 3278 else 3279 batch = (base_batch << pcp->alloc_factor); 3280 3281 /* 3282 * If we had larger pcp->high, we could avoid to allocate from 3283 * zone. 3284 */ 3285 if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags)) 3286 high = pcp->high = min(high + batch, high_max); 3287 3288 if (!order) { 3289 max_nr_alloc = max(high - pcp->count - base_batch, base_batch); 3290 /* 3291 * Double the number of pages allocated each time there is 3292 * subsequent allocation of order-0 pages without any freeing. 
3293 */ 3294 if (batch <= max_nr_alloc && 3295 pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) 3296 pcp->alloc_factor++; 3297 batch = min(batch, max_nr_alloc); 3298 } 3299 3300 /* 3301 * Scale batch relative to order if batch implies free pages 3302 * can be stored on the PCP. Batch can be 1 for small zones or 3303 * for boot pagesets which should never store free pages as 3304 * the pages may belong to arbitrary zones. 3305 */ 3306 if (batch > 1) 3307 batch = max(batch >> order, 2); 3308 3309 return batch; 3310 } 3311 3312 /* Remove page from the per-cpu list, caller must protect the list */ 3313 static inline 3314 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, 3315 int migratetype, 3316 unsigned int alloc_flags, 3317 struct per_cpu_pages *pcp, 3318 struct list_head *list) 3319 { 3320 struct page *page; 3321 3322 do { 3323 if (list_empty(list)) { 3324 int batch = nr_pcp_alloc(pcp, zone, order); 3325 int alloced; 3326 3327 /* 3328 * Don't refill the list for a higher order atomic 3329 * allocation under memory pressure, as this would 3330 * not build up any HIGHATOMIC reserves, which 3331 * might be needed soon. 3332 * 3333 * Instead, direct it towards the reserves by 3334 * returning NULL, which will make the caller fall 3335 * back to rmqueue_buddy. This will try to use the 3336 * reserves first and grow them if needed. 
3337 */ 3338 if (alloc_flags & ALLOC_HIGHATOMIC) 3339 return NULL; 3340 3341 alloced = rmqueue_bulk(zone, order, 3342 batch, list, 3343 migratetype, alloc_flags); 3344 3345 pcp->count += alloced << order; 3346 if (unlikely(list_empty(list))) 3347 return NULL; 3348 } 3349 3350 page = list_first_entry(list, struct page, pcp_list); 3351 list_del(&page->pcp_list); 3352 pcp->count -= 1 << order; 3353 } while (check_new_pages(page, order)); 3354 3355 return page; 3356 } 3357 3358 /* Lock and remove page from the per-cpu list */ 3359 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 3360 struct zone *zone, unsigned int order, 3361 int migratetype, unsigned int alloc_flags) 3362 { 3363 struct per_cpu_pages *pcp; 3364 struct list_head *list; 3365 struct page *page; 3366 3367 /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ 3368 pcp = pcp_spin_trylock(zone->per_cpu_pageset); 3369 if (!pcp) 3370 return NULL; 3371 3372 /* 3373 * On allocation, reduce the number of pages that are batch freed. 3374 * See nr_pcp_free() where free_factor is increased for subsequent 3375 * frees. 3376 */ 3377 pcp->free_count >>= 1; 3378 list = &pcp->lists[order_to_pindex(migratetype, order)]; 3379 page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); 3380 pcp_spin_unlock(pcp); 3381 if (page) { 3382 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 3383 zone_statistics(preferred_zone, zone, 1); 3384 } 3385 return page; 3386 } 3387 3388 /* 3389 * Allocate a page from the given zone. 3390 * Use pcplists for THP or "cheap" high-order allocations. 3391 */ 3392 3393 /* 3394 * Do not instrument rmqueue() with KMSAN. This function may call 3395 * __msan_poison_alloca() through a call to set_pfnblock_migratetype(). 3396 * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it 3397 * may call rmqueue() again, which will result in a deadlock. 
3398 */ 3399 __no_sanitize_memory 3400 static inline 3401 struct page *rmqueue(struct zone *preferred_zone, 3402 struct zone *zone, unsigned int order, 3403 gfp_t gfp_flags, unsigned int alloc_flags, 3404 int migratetype) 3405 { 3406 struct page *page; 3407 3408 if (likely(pcp_allowed_order(order))) { 3409 page = rmqueue_pcplist(preferred_zone, zone, order, 3410 migratetype, alloc_flags); 3411 if (likely(page)) 3412 goto out; 3413 } 3414 3415 page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, 3416 migratetype); 3417 3418 out: 3419 /* Separate test+clear to avoid unnecessary atomics */ 3420 if ((alloc_flags & ALLOC_KSWAPD) && 3421 unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { 3422 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); 3423 wakeup_kswapd(zone, 0, 0, zone_idx(zone)); 3424 } 3425 3426 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 3427 return page; 3428 } 3429 3430 /* 3431 * Reserve the pageblock(s) surrounding an allocation request for 3432 * exclusive use of high-order atomic allocations if there are no 3433 * empty page blocks that contain a page with a suitable order 3434 */ 3435 static void reserve_highatomic_pageblock(struct page *page, int order, 3436 struct zone *zone) 3437 { 3438 int mt; 3439 unsigned long max_managed; 3440 3441 /* 3442 * The number reserved as: minimum is 1 pageblock, maximum is 3443 * roughly 1% of a zone. But if 1% of a zone falls below a 3444 * pageblock size, then don't reserve any pageblocks. 3445 * Check is race-prone but harmless. 3446 */ 3447 if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) 3448 return; 3449 max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); 3450 if (zone->nr_reserved_highatomic >= max_managed) 3451 return; 3452 3453 guard(spinlock_irqsave)(&zone->lock); 3454 3455 /* Recheck the nr_reserved_highatomic limit under the lock */ 3456 if (zone->nr_reserved_highatomic >= max_managed) 3457 return; 3458 3459 /* Yoink! 
*/ 3460 mt = get_pageblock_migratetype(page); 3461 /* Only reserve normal pageblocks (i.e., they can merge with others) */ 3462 if (!migratetype_is_mergeable(mt)) 3463 return; 3464 3465 if (order < pageblock_order) { 3466 if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) 3467 return; 3468 zone->nr_reserved_highatomic += pageblock_nr_pages; 3469 } else { 3470 change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); 3471 zone->nr_reserved_highatomic += 1 << order; 3472 } 3473 } 3474 3475 /* 3476 * Used when an allocation is about to fail under memory pressure. This 3477 * potentially hurts the reliability of high-order allocations when under 3478 * intense memory pressure but failed atomic allocations should be easier 3479 * to recover from than an OOM. 3480 * 3481 * If @force is true, try to unreserve pageblocks even though highatomic 3482 * pageblock is exhausted. 3483 */ 3484 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, 3485 bool force) 3486 { 3487 struct zonelist *zonelist = ac->zonelist; 3488 struct zoneref *z; 3489 struct zone *zone; 3490 struct page *page; 3491 int order; 3492 int ret; 3493 3494 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, 3495 ac->nodemask) { 3496 /* 3497 * Preserve at least one pageblock unless memory pressure 3498 * is really high. 
3499 */ 3500 if (!force && zone->nr_reserved_highatomic <= 3501 pageblock_nr_pages) 3502 continue; 3503 3504 guard(spinlock_irqsave)(&zone->lock); 3505 for (order = 0; order < NR_PAGE_ORDERS; order++) { 3506 struct free_area *area = &(zone->free_area[order]); 3507 unsigned long size; 3508 3509 page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); 3510 if (!page) 3511 continue; 3512 3513 size = max(pageblock_nr_pages, 1UL << order); 3514 /* 3515 * It should never happen but changes to 3516 * locking could inadvertently allow a per-cpu 3517 * drain to add pages to MIGRATE_HIGHATOMIC 3518 * while unreserving so be safe and watch for 3519 * underflows. 3520 */ 3521 if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic)) 3522 size = zone->nr_reserved_highatomic; 3523 zone->nr_reserved_highatomic -= size; 3524 3525 /* 3526 * Convert to ac->migratetype and avoid the normal 3527 * pageblock stealing heuristics. Minimally, the caller 3528 * is doing the work and needs the pages. More 3529 * importantly, if the block was always converted to 3530 * MIGRATE_UNMOVABLE or another type then the number 3531 * of pageblocks that cannot be completely freed 3532 * may increase. 3533 */ 3534 if (order < pageblock_order) 3535 ret = move_freepages_block(zone, page, 3536 MIGRATE_HIGHATOMIC, 3537 ac->migratetype); 3538 else { 3539 move_to_free_list(page, zone, order, 3540 MIGRATE_HIGHATOMIC, 3541 ac->migratetype); 3542 change_pageblock_range(page, order, 3543 ac->migratetype); 3544 ret = 1; 3545 } 3546 /* 3547 * Reserving the block(s) already succeeded, 3548 * so this should not fail on zone boundaries. 
3549 */ 3550 WARN_ON_ONCE(ret == -1); 3551 if (ret > 0) 3552 return ret; 3553 } 3554 } 3555 3556 return false; 3557 } 3558 3559 static inline long __zone_watermark_unusable_free(struct zone *z, 3560 unsigned int order, unsigned int alloc_flags) 3561 { 3562 long unusable_free = (1 << order) - 1; 3563 3564 /* 3565 * If the caller does not have rights to reserves below the min 3566 * watermark then subtract the free pages reserved for highatomic. 3567 */ 3568 if (likely(!(alloc_flags & ALLOC_RESERVES))) 3569 unusable_free += READ_ONCE(z->nr_free_highatomic); 3570 3571 #ifdef CONFIG_CMA 3572 /* If allocation can't use CMA areas don't use free CMA pages */ 3573 if (!(alloc_flags & ALLOC_CMA)) 3574 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); 3575 #endif 3576 3577 return unusable_free; 3578 } 3579 3580 /* 3581 * Return true if free base pages are above 'mark'. For high-order checks it 3582 * will return true of the order-0 watermark is reached and there is at least 3583 * one free page of a suitable size. Checking now avoids taking the zone lock 3584 * to check in the allocation paths if no pages are free. 3585 */ 3586 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3587 int highest_zoneidx, unsigned int alloc_flags, 3588 long free_pages) 3589 { 3590 long min = mark; 3591 int o; 3592 3593 /* free_pages may go negative - that's OK */ 3594 free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); 3595 3596 if (unlikely(alloc_flags & ALLOC_RESERVES)) { 3597 /* 3598 * __GFP_HIGH allows access to 50% of the min reserve as well 3599 * as OOM. 3600 */ 3601 if (alloc_flags & ALLOC_MIN_RESERVE) { 3602 min -= min / 2; 3603 3604 /* 3605 * Non-blocking allocations (e.g. GFP_ATOMIC) can 3606 * access more reserves than just __GFP_HIGH. Other 3607 * non-blocking allocations requests such as GFP_NOWAIT 3608 * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get 3609 * access to the min reserve. 
3610 */ 3611 if (alloc_flags & ALLOC_NON_BLOCK) 3612 min -= min / 4; 3613 } 3614 3615 /* 3616 * OOM victims can try even harder than the normal reserve 3617 * users on the grounds that it's definitely going to be in 3618 * the exit path shortly and free memory. Any allocation it 3619 * makes during the free path will be small and short-lived. 3620 */ 3621 if (alloc_flags & ALLOC_OOM) 3622 min -= min / 2; 3623 } 3624 3625 /* 3626 * Check watermarks for an order-0 allocation request. If these 3627 * are not met, then a high-order request also cannot go ahead 3628 * even if a suitable page happened to be free. 3629 */ 3630 if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) 3631 return false; 3632 3633 /* If this is an order-0 request then the watermark is fine */ 3634 if (!order) 3635 return true; 3636 3637 /* For a high-order request, check at least one suitable page is free */ 3638 for (o = order; o < NR_PAGE_ORDERS; o++) { 3639 struct free_area *area = &z->free_area[o]; 3640 int mt; 3641 3642 if (!area->nr_free) 3643 continue; 3644 3645 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3646 if (!free_area_empty(area, mt)) 3647 return true; 3648 } 3649 3650 #ifdef CONFIG_CMA 3651 if ((alloc_flags & ALLOC_CMA) && 3652 !free_area_empty(area, MIGRATE_CMA)) { 3653 return true; 3654 } 3655 #endif 3656 if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && 3657 !free_area_empty(area, MIGRATE_HIGHATOMIC)) { 3658 return true; 3659 } 3660 } 3661 return false; 3662 } 3663 3664 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3665 int highest_zoneidx, unsigned int alloc_flags) 3666 { 3667 return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 3668 zone_page_state(z, NR_FREE_PAGES)); 3669 } 3670 3671 static inline bool zone_watermark_fast(struct zone *z, unsigned int order, 3672 unsigned long mark, int highest_zoneidx, 3673 unsigned int alloc_flags, gfp_t gfp_mask) 3674 { 3675 long free_pages; 3676 3677 free_pages = 
zone_page_state(z, NR_FREE_PAGES); 3678 3679 /* 3680 * Fast check for order-0 only. If this fails then the reserves 3681 * need to be calculated. 3682 */ 3683 if (!order) { 3684 long usable_free; 3685 long reserved; 3686 3687 usable_free = free_pages; 3688 reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); 3689 3690 /* reserved may over estimate high-atomic reserves. */ 3691 usable_free -= min(usable_free, reserved); 3692 if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) 3693 return true; 3694 } 3695 3696 if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, 3697 free_pages)) 3698 return true; 3699 3700 /* 3701 * Ignore watermark boosting for __GFP_HIGH order-0 allocations 3702 * when checking the min watermark. The min watermark is the 3703 * point where boosting is ignored so that kswapd is woken up 3704 * when below the low watermark. 3705 */ 3706 if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost 3707 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { 3708 mark = z->_watermark[WMARK_MIN]; 3709 return __zone_watermark_ok(z, order, mark, highest_zoneidx, 3710 alloc_flags, free_pages); 3711 } 3712 3713 return false; 3714 } 3715 3716 #ifdef CONFIG_NUMA 3717 int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; 3718 3719 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3720 { 3721 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3722 node_reclaim_distance; 3723 } 3724 #else /* CONFIG_NUMA */ 3725 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3726 { 3727 return true; 3728 } 3729 #endif /* CONFIG_NUMA */ 3730 3731 /* 3732 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid 3733 * fragmentation is subtle. If the preferred zone was HIGHMEM then 3734 * premature use of a lower zone may cause lowmem pressure problems that 3735 * are worse than fragmentation. 
If the next zone is ZONE_DMA then it is 3736 * probably too small. It only makes sense to spread allocations to avoid 3737 * fragmentation between the Normal and DMA32 zones. 3738 */ 3739 static inline unsigned int 3740 alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) 3741 { 3742 unsigned int alloc_flags; 3743 3744 /* 3745 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 3746 * to save a branch. 3747 */ 3748 alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); 3749 3750 if (defrag_mode) { 3751 alloc_flags |= ALLOC_NOFRAGMENT; 3752 return alloc_flags; 3753 } 3754 3755 #ifdef CONFIG_ZONE_DMA32 3756 if (!zone) 3757 return alloc_flags; 3758 3759 if (zone_idx(zone) != ZONE_NORMAL) 3760 return alloc_flags; 3761 3762 /* 3763 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and 3764 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume 3765 * on UMA that if Normal is populated then so is DMA32. 3766 */ 3767 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); 3768 if (nr_online_nodes > 1 && !populated_zone(--zone)) 3769 return alloc_flags; 3770 3771 alloc_flags |= ALLOC_NOFRAGMENT; 3772 #endif /* CONFIG_ZONE_DMA32 */ 3773 return alloc_flags; 3774 } 3775 3776 /* Must be called after current_gfp_context() which can change gfp_mask */ 3777 static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, 3778 unsigned int alloc_flags) 3779 { 3780 #ifdef CONFIG_CMA 3781 if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) 3782 alloc_flags |= ALLOC_CMA; 3783 #endif 3784 return alloc_flags; 3785 } 3786 3787 /* 3788 * get_page_from_freelist goes through the zonelist trying to allocate 3789 * a page. 
3790 */ 3791 static struct page * 3792 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3793 const struct alloc_context *ac) 3794 { 3795 struct zoneref *z; 3796 struct zone *zone; 3797 struct pglist_data *last_pgdat = NULL; 3798 bool last_pgdat_dirty_ok = false; 3799 bool no_fallback; 3800 bool skip_kswapd_nodes = nr_online_nodes > 1; 3801 bool skipped_kswapd_nodes = false; 3802 3803 retry: 3804 /* 3805 * Scan zonelist, looking for a zone with enough free. 3806 * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c. 3807 */ 3808 no_fallback = alloc_flags & ALLOC_NOFRAGMENT; 3809 z = ac->preferred_zoneref; 3810 for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, 3811 ac->nodemask) { 3812 struct page *page; 3813 unsigned long mark; 3814 3815 if (cpusets_enabled() && 3816 (alloc_flags & ALLOC_CPUSET) && 3817 !__cpuset_zone_allowed(zone, gfp_mask)) 3818 continue; 3819 /* 3820 * When allocating a page cache page for writing, we 3821 * want to get it from a node that is within its dirty 3822 * limit, such that no single node holds more than its 3823 * proportional share of globally allowed dirty pages. 3824 * The dirty limits take into account the node's 3825 * lowmem reserves and high watermark so that kswapd 3826 * should be able to balance it without having to 3827 * write pages from its LRU list. 3828 * 3829 * XXX: For now, allow allocations to potentially 3830 * exceed the per-node dirty limit in the slowpath 3831 * (spread_dirty_pages unset) before going into reclaim, 3832 * which is important when on a NUMA setup the allowed 3833 * nodes are together not big enough to reach the 3834 * global limit. The proper fix for these situations 3835 * will require awareness of nodes in the 3836 * dirty-throttling and the flusher threads. 
3837 */ 3838 if (ac->spread_dirty_pages) { 3839 if (last_pgdat != zone->zone_pgdat) { 3840 last_pgdat = zone->zone_pgdat; 3841 last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat); 3842 } 3843 3844 if (!last_pgdat_dirty_ok) 3845 continue; 3846 } 3847 3848 if (no_fallback && !defrag_mode && nr_online_nodes > 1 && 3849 zone != zonelist_zone(ac->preferred_zoneref)) { 3850 int local_nid; 3851 3852 /* 3853 * If moving to a remote node, retry but allow 3854 * fragmenting fallbacks. Locality is more important 3855 * than fragmentation avoidance. 3856 */ 3857 local_nid = zonelist_node_idx(ac->preferred_zoneref); 3858 if (zone_to_nid(zone) != local_nid) { 3859 alloc_flags &= ~ALLOC_NOFRAGMENT; 3860 goto retry; 3861 } 3862 } 3863 3864 /* 3865 * If kswapd is already active on a node, keep looking 3866 * for other nodes that might be idle. This can happen 3867 * if another process has NUMA bindings and is causing 3868 * kswapd wakeups on only some nodes. Avoid accidental 3869 * "node_reclaim_mode"-like behavior in this case. 3870 */ 3871 if (skip_kswapd_nodes && 3872 !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) { 3873 skipped_kswapd_nodes = true; 3874 continue; 3875 } 3876 3877 cond_accept_memory(zone, order, alloc_flags); 3878 3879 /* 3880 * Detect whether the number of free pages is below high 3881 * watermark. If so, we will decrease pcp->high and free 3882 * PCP pages in free path to reduce the possibility of 3883 * premature page reclaiming. Detection is done here to 3884 * avoid to do that in hotter free path. 
3885 */ 3886 if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) 3887 goto check_alloc_wmark; 3888 3889 mark = high_wmark_pages(zone); 3890 if (zone_watermark_fast(zone, order, mark, 3891 ac->highest_zoneidx, alloc_flags, 3892 gfp_mask)) 3893 goto try_this_zone; 3894 else 3895 set_bit(ZONE_BELOW_HIGH, &zone->flags); 3896 3897 check_alloc_wmark: 3898 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); 3899 if (!zone_watermark_fast(zone, order, mark, 3900 ac->highest_zoneidx, alloc_flags, 3901 gfp_mask)) { 3902 int ret; 3903 3904 if (cond_accept_memory(zone, order, alloc_flags)) 3905 goto try_this_zone; 3906 3907 /* 3908 * Watermark failed for this zone, but see if we can 3909 * grow this zone if it contains deferred pages. 3910 */ 3911 if (deferred_pages_enabled()) { 3912 if (_deferred_grow_zone(zone, order)) 3913 goto try_this_zone; 3914 } 3915 /* Checked here to keep the fast path fast */ 3916 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 3917 if (alloc_flags & ALLOC_NO_WATERMARKS) 3918 goto try_this_zone; 3919 3920 if (!node_reclaim_enabled() || 3921 !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone)) 3922 continue; 3923 3924 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 3925 switch (ret) { 3926 case NODE_RECLAIM_NOSCAN: 3927 /* did not scan */ 3928 continue; 3929 case NODE_RECLAIM_FULL: 3930 /* scanned but unreclaimable */ 3931 continue; 3932 default: 3933 /* did we reclaim enough */ 3934 if (zone_watermark_ok(zone, order, mark, 3935 ac->highest_zoneidx, alloc_flags)) 3936 goto try_this_zone; 3937 3938 continue; 3939 } 3940 } 3941 3942 try_this_zone: 3943 page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order, 3944 gfp_mask, alloc_flags, ac->migratetype); 3945 if (page) { 3946 prep_new_page(page, order, gfp_mask, alloc_flags); 3947 3948 return page; 3949 } else { 3950 if (cond_accept_memory(zone, order, alloc_flags)) 3951 goto try_this_zone; 3952 3953 /* Try again if zone has deferred pages */ 3954 if (deferred_pages_enabled()) { 
3955 if (_deferred_grow_zone(zone, order)) 3956 goto try_this_zone; 3957 } 3958 } 3959 } 3960 3961 /* 3962 * If we skipped over nodes with active kswapds and found no 3963 * idle nodes, retry and place anywhere the watermarks permit. 3964 */ 3965 if (skip_kswapd_nodes && skipped_kswapd_nodes) { 3966 skip_kswapd_nodes = false; 3967 goto retry; 3968 } 3969 3970 /* 3971 * It's possible on a UMA machine to get through all zones that are 3972 * fragmented. If avoiding fragmentation, reset and try again. 3973 */ 3974 if (no_fallback && !defrag_mode) { 3975 alloc_flags &= ~ALLOC_NOFRAGMENT; 3976 goto retry; 3977 } 3978 3979 return NULL; 3980 } 3981 3982 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3983 { 3984 unsigned int filter = SHOW_MEM_FILTER_NODES; 3985 3986 /* 3987 * This documents exceptions given to allocations in certain 3988 * contexts that are allowed to allocate outside current's set 3989 * of allowed nodes. 3990 */ 3991 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3992 if (tsk_is_oom_victim(current) || 3993 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3994 filter &= ~SHOW_MEM_FILTER_NODES; 3995 if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3996 filter &= ~SHOW_MEM_FILTER_NODES; 3997 3998 __show_mem(filter, nodemask, gfp_zone(gfp_mask)); 3999 mem_cgroup_show_protected_memory(NULL); 4000 } 4001 4002 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
4003 { 4004 struct va_format vaf; 4005 va_list args; 4006 static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); 4007 4008 if ((gfp_mask & __GFP_NOWARN) || 4009 !__ratelimit(&nopage_rs) || 4010 ((gfp_mask & __GFP_DMA) && !has_managed_dma())) 4011 return; 4012 4013 va_start(args, fmt); 4014 vaf.fmt = fmt; 4015 vaf.va = &args; 4016 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", 4017 current->comm, &vaf, gfp_mask, &gfp_mask, 4018 nodemask_pr_args(nodemask)); 4019 va_end(args); 4020 4021 cpuset_print_current_mems_allowed(); 4022 pr_cont("\n"); 4023 dump_stack(); 4024 warn_alloc_show_mem(gfp_mask, nodemask); 4025 } 4026 4027 static inline struct page * 4028 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 4029 unsigned int alloc_flags, 4030 const struct alloc_context *ac) 4031 { 4032 struct page *page; 4033 4034 page = get_page_from_freelist(gfp_mask, order, 4035 alloc_flags|ALLOC_CPUSET, ac); 4036 /* 4037 * fallback to ignore cpuset restriction if our nodes 4038 * are depleted 4039 */ 4040 if (!page) 4041 page = get_page_from_freelist(gfp_mask, order, 4042 alloc_flags, ac); 4043 return page; 4044 } 4045 4046 static inline struct page * 4047 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 4048 const struct alloc_context *ac, unsigned long *did_some_progress) 4049 { 4050 struct oom_control oc = { 4051 .zonelist = ac->zonelist, 4052 .nodemask = ac->nodemask, 4053 .memcg = NULL, 4054 .gfp_mask = gfp_mask, 4055 .order = order, 4056 }; 4057 struct page *page; 4058 4059 *did_some_progress = 0; 4060 4061 /* 4062 * Acquire the oom lock. If that fails, somebody else is 4063 * making progress for us. 4064 */ 4065 if (!mutex_trylock(&oom_lock)) { 4066 *did_some_progress = 1; 4067 schedule_timeout_uninterruptible(1); 4068 return NULL; 4069 } 4070 4071 /* 4072 * Go through the zonelist yet one more time, keep very high watermark 4073 * here, this is only to catch a parallel oom killing, we must fail if 4074 * we're still under heavy pressure. 
But make sure that this reclaim 4075 * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY 4076 * allocation which will never fail due to oom_lock already held. 4077 */ 4078 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & 4079 ~__GFP_DIRECT_RECLAIM, order, 4080 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 4081 if (page) 4082 goto out; 4083 4084 /* Coredumps can quickly deplete all memory reserves */ 4085 if (current->flags & PF_DUMPCORE) 4086 goto out; 4087 /* The OOM killer will not help higher order allocs */ 4088 if (order > PAGE_ALLOC_COSTLY_ORDER) 4089 goto out; 4090 /* 4091 * We have already exhausted all our reclaim opportunities without any 4092 * success so it is time to admit defeat. We will skip the OOM killer 4093 * because it is very likely that the caller has a more reasonable 4094 * fallback than shooting a random task. 4095 * 4096 * The OOM killer may not free memory on a specific node. 4097 */ 4098 if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) 4099 goto out; 4100 /* The OOM killer does not needlessly kill tasks for lowmem */ 4101 if (ac->highest_zoneidx < ZONE_NORMAL) 4102 goto out; 4103 if (pm_suspended_storage()) 4104 goto out; 4105 /* 4106 * XXX: GFP_NOFS allocations should rather fail than rely on 4107 * other request to make a forward progress. 4108 * We are in an unfortunate situation where out_of_memory cannot 4109 * do much for this context but let's try it to at least get 4110 * access to memory reserved if the current task is killed (see 4111 * out_of_memory). Once filesystems are ready to handle allocation 4112 * failures more gracefully we should just bail out here. 
4113 */ 4114 4115 /* Exhausted what can be done so it's blame time */ 4116 if (out_of_memory(&oc) || 4117 WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) { 4118 *did_some_progress = 1; 4119 4120 /* 4121 * Help non-failing allocations by giving them access to memory 4122 * reserves 4123 */ 4124 if (gfp_mask & __GFP_NOFAIL) 4125 page = __alloc_pages_cpuset_fallback(gfp_mask, order, 4126 ALLOC_NO_WATERMARKS, ac); 4127 } 4128 out: 4129 mutex_unlock(&oom_lock); 4130 return page; 4131 } 4132 4133 /* 4134 * Maximum number of compaction retries with a progress before OOM 4135 * killer is consider as the only way to move forward. 4136 */ 4137 #define MAX_COMPACT_RETRIES 16 4138 4139 #ifdef CONFIG_COMPACTION 4140 /* Try memory compaction for high-order allocations before reclaim */ 4141 static struct page * 4142 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 4143 unsigned int alloc_flags, const struct alloc_context *ac, 4144 enum compact_priority prio, enum compact_result *compact_result) 4145 { 4146 struct page *page = NULL; 4147 unsigned long pflags; 4148 unsigned int noreclaim_flag; 4149 4150 if (!order) 4151 return NULL; 4152 4153 psi_memstall_enter(&pflags); 4154 delayacct_compact_start(); 4155 noreclaim_flag = memalloc_noreclaim_save(); 4156 4157 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 4158 prio, &page); 4159 4160 memalloc_noreclaim_restore(noreclaim_flag); 4161 psi_memstall_leave(&pflags); 4162 delayacct_compact_end(); 4163 4164 if (*compact_result == COMPACT_SKIPPED || 4165 *compact_result == COMPACT_DEFERRED) 4166 return NULL; 4167 /* 4168 * At least in one zone compaction wasn't deferred or skipped, so let's 4169 * count a compaction stall 4170 */ 4171 count_vm_event(COMPACTSTALL); 4172 4173 /* Prep a captured page if available */ 4174 if (page) 4175 prep_new_page(page, order, gfp_mask, alloc_flags); 4176 4177 /* Try get a page from the freelist if available */ 4178 if (!page) 4179 page = 
get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4180 4181 if (page) { 4182 struct zone *zone = page_zone(page); 4183 4184 zone->compact_blockskip_flush = false; 4185 compaction_defer_reset(zone, order, true); 4186 count_vm_event(COMPACTSUCCESS); 4187 return page; 4188 } 4189 4190 /* 4191 * It's bad if compaction run occurs and fails. The most likely reason 4192 * is that pages exist, but not enough to satisfy watermarks. 4193 */ 4194 count_vm_event(COMPACTFAIL); 4195 4196 cond_resched(); 4197 4198 return NULL; 4199 } 4200 4201 static inline bool 4202 should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order, 4203 int alloc_flags, 4204 enum compact_result compact_result, 4205 enum compact_priority *compact_priority, 4206 int *compaction_retries) 4207 { 4208 int max_retries = MAX_COMPACT_RETRIES; 4209 int min_priority; 4210 bool ret = false; 4211 int retries = *compaction_retries; 4212 enum compact_priority priority = *compact_priority; 4213 4214 if (!order) 4215 return false; 4216 4217 if (fatal_signal_pending(current)) 4218 return false; 4219 4220 /* 4221 * Compaction was skipped due to a lack of free order-0 4222 * migration targets. Continue if reclaim can help. 4223 */ 4224 if (compact_result == COMPACT_SKIPPED) { 4225 ret = compaction_zonelist_suitable(ac, order, alloc_flags, 4226 gfp_mask); 4227 goto out; 4228 } 4229 4230 /* 4231 * Compaction managed to coalesce some page blocks, but the 4232 * allocation failed presumably due to a race. Retry some. 4233 */ 4234 if (compact_result == COMPACT_SUCCESS) { 4235 /* 4236 * !costly requests are much more important than 4237 * __GFP_RETRY_MAYFAIL costly ones because they are de 4238 * facto nofail and invoke OOM killer to move on while 4239 * costly can fail and users are ready to cope with 4240 * that. 1/4 retries is rather arbitrary but we would 4241 * need much more detailed feedback from compaction to 4242 * make a better decision. 
4243 */ 4244 if (order > PAGE_ALLOC_COSTLY_ORDER) 4245 max_retries /= 4; 4246 4247 if (++(*compaction_retries) <= max_retries) { 4248 ret = true; 4249 goto out; 4250 } 4251 } 4252 4253 /* 4254 * Compaction failed. Retry with increasing priority. 4255 */ 4256 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 4257 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 4258 4259 if (*compact_priority > min_priority) { 4260 (*compact_priority)--; 4261 *compaction_retries = 0; 4262 ret = true; 4263 } 4264 out: 4265 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 4266 return ret; 4267 } 4268 #else 4269 static inline struct page * 4270 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 4271 unsigned int alloc_flags, const struct alloc_context *ac, 4272 enum compact_priority prio, enum compact_result *compact_result) 4273 { 4274 *compact_result = COMPACT_SKIPPED; 4275 return NULL; 4276 } 4277 4278 static inline bool 4279 should_compact_retry(gfp_t gfp_mask, struct alloc_context *ac, int order, 4280 int alloc_flags, 4281 enum compact_result compact_result, 4282 enum compact_priority *compact_priority, 4283 int *compaction_retries) 4284 { 4285 struct zone *zone; 4286 struct zoneref *z; 4287 4288 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 4289 return false; 4290 4291 /* 4292 * There are setups with compaction disabled which would prefer to loop 4293 * inside the allocator rather than hit the oom killer prematurely. 4294 * Let's give them a good hope and keep retrying while the order-0 4295 * watermarks are OK. 
4296 */ 4297 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 4298 ac->highest_zoneidx, ac->nodemask) { 4299 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), 4300 ac->highest_zoneidx, alloc_flags)) 4301 return true; 4302 } 4303 return false; 4304 } 4305 #endif /* CONFIG_COMPACTION */ 4306 4307 #ifdef CONFIG_LOCKDEP 4308 static struct lockdep_map __fs_reclaim_map = 4309 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); 4310 4311 static bool __need_reclaim(gfp_t gfp_mask) 4312 { 4313 /* no reclaim without waiting on it */ 4314 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 4315 return false; 4316 4317 /* this guy won't enter reclaim */ 4318 if (current->flags & PF_MEMALLOC) 4319 return false; 4320 4321 if (gfp_mask & __GFP_NOLOCKDEP) 4322 return false; 4323 4324 return true; 4325 } 4326 4327 void __fs_reclaim_acquire(unsigned long ip) 4328 { 4329 lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); 4330 } 4331 4332 void __fs_reclaim_release(unsigned long ip) 4333 { 4334 lock_release(&__fs_reclaim_map, ip); 4335 } 4336 4337 void fs_reclaim_acquire(gfp_t gfp_mask) 4338 { 4339 gfp_mask = current_gfp_context(gfp_mask); 4340 4341 if (__need_reclaim(gfp_mask)) { 4342 if (gfp_mask & __GFP_FS) 4343 __fs_reclaim_acquire(_RET_IP_); 4344 4345 #ifdef CONFIG_MMU_NOTIFIER 4346 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); 4347 lock_map_release(&__mmu_notifier_invalidate_range_start_map); 4348 #endif 4349 4350 } 4351 } 4352 EXPORT_SYMBOL_GPL(fs_reclaim_acquire); 4353 4354 void fs_reclaim_release(gfp_t gfp_mask) 4355 { 4356 gfp_mask = current_gfp_context(gfp_mask); 4357 4358 if (__need_reclaim(gfp_mask)) { 4359 if (gfp_mask & __GFP_FS) 4360 __fs_reclaim_release(_RET_IP_); 4361 } 4362 } 4363 EXPORT_SYMBOL_GPL(fs_reclaim_release); 4364 #endif 4365 4366 /* 4367 * Zonelists may change due to hotplug during allocation. Detect when zonelists 4368 * have been rebuilt so allocation retries. 
Reader side does not lock and 4369 * retries the allocation if zonelist changes. Writer side is protected by the 4370 * embedded spin_lock. 4371 */ 4372 static DEFINE_SEQLOCK(zonelist_update_seq); 4373 4374 static unsigned int zonelist_iter_begin(void) 4375 { 4376 if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) 4377 return read_seqbegin(&zonelist_update_seq); 4378 4379 return 0; 4380 } 4381 4382 static unsigned int check_retry_zonelist(unsigned int seq) 4383 { 4384 if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) 4385 return read_seqretry(&zonelist_update_seq, seq); 4386 4387 return seq; 4388 } 4389 4390 /* Perform direct synchronous page reclaim */ 4391 static unsigned long 4392 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 4393 const struct alloc_context *ac) 4394 { 4395 unsigned int noreclaim_flag; 4396 unsigned long progress; 4397 4398 cond_resched(); 4399 4400 /* We now go into synchronous reclaim */ 4401 cpuset_memory_pressure_bump(); 4402 fs_reclaim_acquire(gfp_mask); 4403 noreclaim_flag = memalloc_noreclaim_save(); 4404 4405 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 4406 ac->nodemask); 4407 4408 memalloc_noreclaim_restore(noreclaim_flag); 4409 fs_reclaim_release(gfp_mask); 4410 4411 cond_resched(); 4412 4413 return progress; 4414 } 4415 4416 /* The really slow allocator path where we enter direct reclaim */ 4417 static inline struct page * 4418 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 4419 unsigned int alloc_flags, const struct alloc_context *ac, 4420 unsigned long *did_some_progress) 4421 { 4422 struct page *page = NULL; 4423 unsigned long pflags; 4424 bool drained = false; 4425 4426 psi_memstall_enter(&pflags); 4427 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 4428 if (unlikely(!(*did_some_progress))) 4429 goto out; 4430 4431 retry: 4432 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4433 4434 /* 4435 * If an allocation failed after direct reclaim, it could be because 4436 * pages are 
pinned on the per-cpu lists or in high alloc reserves. 4437 * Shrink them and try again 4438 */ 4439 if (!page && !drained) { 4440 unreserve_highatomic_pageblock(ac, false); 4441 drain_all_pages(NULL); 4442 drained = true; 4443 goto retry; 4444 } 4445 out: 4446 psi_memstall_leave(&pflags); 4447 4448 return page; 4449 } 4450 4451 static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, 4452 const struct alloc_context *ac) 4453 { 4454 struct zoneref *z; 4455 struct zone *zone; 4456 pg_data_t *last_pgdat = NULL; 4457 enum zone_type highest_zoneidx = ac->highest_zoneidx; 4458 unsigned int reclaim_order; 4459 4460 if (defrag_mode) 4461 reclaim_order = max(order, pageblock_order); 4462 else 4463 reclaim_order = order; 4464 4465 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, 4466 ac->nodemask) { 4467 if (!managed_zone(zone)) 4468 continue; 4469 if (last_pgdat == zone->zone_pgdat) 4470 continue; 4471 wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx); 4472 last_pgdat = zone->zone_pgdat; 4473 } 4474 } 4475 4476 static inline unsigned int 4477 gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) 4478 { 4479 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 4480 4481 /* 4482 * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE 4483 * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD 4484 * to save two branches. 4485 */ 4486 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); 4487 BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); 4488 4489 /* 4490 * The caller may dip into page reserves a bit more if the caller 4491 * cannot run direct reclaim, or if the caller has realtime scheduling 4492 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 4493 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). 
4494 */ 4495 alloc_flags |= (__force int) 4496 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); 4497 4498 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { 4499 /* 4500 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 4501 * if it can't schedule. 4502 */ 4503 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 4504 alloc_flags |= ALLOC_NON_BLOCK; 4505 4506 if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE)) 4507 alloc_flags |= ALLOC_HIGHATOMIC; 4508 } 4509 4510 /* 4511 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably 4512 * GFP_ATOMIC) rather than fail, see the comment for 4513 * cpuset_current_node_allowed(). 4514 */ 4515 if (alloc_flags & ALLOC_MIN_RESERVE) 4516 alloc_flags &= ~ALLOC_CPUSET; 4517 } else if (unlikely(rt_or_dl_task(current)) && in_task()) 4518 alloc_flags |= ALLOC_MIN_RESERVE; 4519 4520 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); 4521 4522 if (defrag_mode) 4523 alloc_flags |= ALLOC_NOFRAGMENT; 4524 4525 return alloc_flags; 4526 } 4527 4528 static bool oom_reserves_allowed(struct task_struct *tsk) 4529 { 4530 if (!tsk_is_oom_victim(tsk)) 4531 return false; 4532 4533 /* 4534 * !MMU doesn't have oom reaper so give access to memory reserves 4535 * only to the thread with TIF_MEMDIE set 4536 */ 4537 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) 4538 return false; 4539 4540 return true; 4541 } 4542 4543 /* 4544 * Distinguish requests which really need access to full memory 4545 * reserves from oom victims which can live with a portion of it 4546 */ 4547 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) 4548 { 4549 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 4550 return 0; 4551 if (gfp_mask & __GFP_MEMALLOC) 4552 return ALLOC_NO_WATERMARKS; 4553 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 4554 return ALLOC_NO_WATERMARKS; 4555 if (!in_interrupt()) { 4556 if (current->flags & PF_MEMALLOC) 4557 return ALLOC_NO_WATERMARKS; 4558 else if (oom_reserves_allowed(current)) 4559 return ALLOC_OOM; 4560 } 4561 
4562 return 0; 4563 } 4564 4565 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 4566 { 4567 return !!__gfp_pfmemalloc_flags(gfp_mask); 4568 } 4569 4570 /* 4571 * Checks whether it makes sense to retry the reclaim to make a forward progress 4572 * for the given allocation request. 4573 * 4574 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row 4575 * without success, or when we couldn't even meet the watermark if we 4576 * reclaimed all remaining pages on the LRU lists. 4577 * 4578 * Returns true if a retry is viable or false to enter the oom path. 4579 */ 4580 static inline bool 4581 should_reclaim_retry(gfp_t gfp_mask, unsigned order, 4582 struct alloc_context *ac, int alloc_flags, 4583 bool did_some_progress, int *no_progress_loops) 4584 { 4585 struct zone *zone; 4586 struct zoneref *z; 4587 bool ret = false; 4588 4589 /* 4590 * Costly allocations might have made a progress but this doesn't mean 4591 * their order will become available due to high fragmentation so 4592 * always increment the no progress counter for them 4593 */ 4594 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) 4595 *no_progress_loops = 0; 4596 else 4597 (*no_progress_loops)++; 4598 4599 if (*no_progress_loops > MAX_RECLAIM_RETRIES) 4600 goto out; 4601 4602 4603 /* 4604 * Keep reclaiming pages while there is a chance this will lead 4605 * somewhere. If none of the target zones can satisfy our allocation 4606 * request even if all reclaimable pages are considered then we are 4607 * screwed and have to go OOM. 
4608 */ 4609 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 4610 ac->highest_zoneidx, ac->nodemask) { 4611 unsigned long available; 4612 unsigned long reclaimable; 4613 unsigned long min_wmark = min_wmark_pages(zone); 4614 bool wmark; 4615 4616 if (cpusets_enabled() && 4617 (alloc_flags & ALLOC_CPUSET) && 4618 !__cpuset_zone_allowed(zone, gfp_mask)) 4619 continue; 4620 4621 available = reclaimable = zone_reclaimable_pages(zone); 4622 available += zone_page_state_snapshot(zone, NR_FREE_PAGES); 4623 4624 /* 4625 * Would the allocation succeed if we reclaimed all 4626 * reclaimable pages? 4627 */ 4628 wmark = __zone_watermark_ok(zone, order, min_wmark, 4629 ac->highest_zoneidx, alloc_flags, available); 4630 trace_reclaim_retry_zone(z, order, reclaimable, 4631 available, min_wmark, *no_progress_loops, wmark); 4632 if (wmark) { 4633 ret = true; 4634 break; 4635 } 4636 } 4637 4638 /* 4639 * Memory allocation/reclaim might be called from a WQ context and the 4640 * current implementation of the WQ concurrency control doesn't 4641 * recognize that a particular WQ is congested if the worker thread is 4642 * looping without ever sleeping. Therefore we have to do a short sleep 4643 * here rather than calling cond_resched(). 4644 */ 4645 if (current->flags & PF_WQ_WORKER) 4646 schedule_timeout_uninterruptible(1); 4647 else 4648 cond_resched(); 4649 out: 4650 /* Before OOM, exhaust highatomic_reserve */ 4651 if (!ret) 4652 return unreserve_highatomic_pageblock(ac, true); 4653 4654 return ret; 4655 } 4656 4657 static inline bool 4658 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) 4659 { 4660 /* 4661 * It's possible that cpuset's mems_allowed and the nodemask from 4662 * mempolicy don't intersect. This should be normally dealt with by 4663 * policy_nodemask(), but it's possible to race with cpuset update in 4664 * such a way the check therein was true, and then it became false 4665 * before we got our cpuset_mems_cookie here. 
4666 * This assumes that for all allocations, ac->nodemask can come only 4667 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored 4668 * when it does not intersect with the cpuset restrictions) or the 4669 * caller can deal with a violated nodemask. 4670 */ 4671 if (cpusets_enabled() && ac->nodemask && 4672 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { 4673 ac->nodemask = NULL; 4674 return true; 4675 } 4676 4677 /* 4678 * When updating a task's mems_allowed or mempolicy nodemask, it is 4679 * possible to race with parallel threads in such a way that our 4680 * allocation can fail while the mask is being updated. If we are about 4681 * to fail, check if the cpuset changed during allocation and if so, 4682 * retry. 4683 */ 4684 if (read_mems_allowed_retry(cpuset_mems_cookie)) 4685 return true; 4686 4687 return false; 4688 } 4689 4690 static void check_alloc_stall_warn(gfp_t gfp_mask, nodemask_t *nodemask, 4691 unsigned int order, unsigned long alloc_start_time) 4692 { 4693 static DEFINE_SPINLOCK(alloc_stall_lock); 4694 unsigned long stall_msecs = jiffies_to_msecs(jiffies - alloc_start_time); 4695 4696 if (likely(stall_msecs < ALLOC_STALL_WARN_MSECS)) 4697 return; 4698 if (time_is_after_jiffies(READ_ONCE(alloc_stall_warn_jiffies))) 4699 return; 4700 if (gfp_mask & __GFP_NOWARN) 4701 return; 4702 4703 if (!spin_trylock(&alloc_stall_lock)) 4704 return; 4705 4706 /* Check again, this time under the lock */ 4707 if (time_is_after_jiffies(alloc_stall_warn_jiffies)) { 4708 spin_unlock(&alloc_stall_lock); 4709 return; 4710 } 4711 4712 WRITE_ONCE(alloc_stall_warn_jiffies, jiffies + msecs_to_jiffies(ALLOC_STALL_WARN_MSECS)); 4713 spin_unlock(&alloc_stall_lock); 4714 4715 pr_warn("%s: page allocation stall for %lu secs: order:%d, mode:%#x(%pGg) nodemask=%*pbl", 4716 current->comm, stall_msecs / MSEC_PER_SEC, order, gfp_mask, &gfp_mask, 4717 nodemask_pr_args(nodemask)); 4718 cpuset_print_current_mems_allowed(); 4719 pr_cont("\n"); 4720 dump_stack(); 
4721 warn_alloc_show_mem(gfp_mask, nodemask); 4722 } 4723 4724 static inline struct page * 4725 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 4726 struct alloc_context *ac) 4727 { 4728 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 4729 bool can_compact = can_direct_reclaim && gfp_compaction_allowed(gfp_mask); 4730 bool nofail = gfp_mask & __GFP_NOFAIL; 4731 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; 4732 struct page *page = NULL; 4733 unsigned int alloc_flags; 4734 unsigned long did_some_progress; 4735 enum compact_priority compact_priority; 4736 enum compact_result compact_result; 4737 int compaction_retries; 4738 int no_progress_loops; 4739 unsigned int cpuset_mems_cookie; 4740 unsigned int zonelist_iter_cookie; 4741 int reserve_flags; 4742 bool compact_first = false; 4743 bool can_retry_reserves = true; 4744 unsigned long alloc_start_time = jiffies; 4745 4746 if (unlikely(nofail)) { 4747 /* 4748 * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM, 4749 * otherwise, we may result in lockup. 4750 */ 4751 WARN_ON_ONCE(!can_direct_reclaim); 4752 /* 4753 * PF_MEMALLOC request from this context is rather bizarre 4754 * because we cannot reclaim anything and only can loop waiting 4755 * for somebody to do a work for us. 4756 */ 4757 WARN_ON_ONCE(current->flags & PF_MEMALLOC); 4758 } 4759 4760 restart: 4761 compaction_retries = 0; 4762 no_progress_loops = 0; 4763 compact_result = COMPACT_SKIPPED; 4764 compact_priority = DEF_COMPACT_PRIORITY; 4765 cpuset_mems_cookie = read_mems_allowed_begin(); 4766 zonelist_iter_cookie = zonelist_iter_begin(); 4767 4768 /* 4769 * For costly allocations, try direct compaction first, as it's likely 4770 * that we have enough base pages and don't need to reclaim. For non- 4771 * movable high-order allocations, do that as well, as compaction will 4772 * try prevent permanent fragmentation by migrating from blocks of the 4773 * same migratetype. 
4774 */ 4775 if (can_compact && (costly_order || (order > 0 && 4776 ac->migratetype != MIGRATE_MOVABLE))) { 4777 compact_first = true; 4778 compact_priority = INIT_COMPACT_PRIORITY; 4779 } 4780 4781 /* 4782 * The fast path uses conservative alloc_flags to succeed only until 4783 * kswapd needs to be woken up, and to avoid the cost of setting up 4784 * alloc_flags precisely. So we do that now. 4785 */ 4786 alloc_flags = gfp_to_alloc_flags(gfp_mask, order); 4787 4788 /* 4789 * We need to recalculate the starting point for the zonelist iterator 4790 * because we might have used different nodemask in the fast path, or 4791 * there was a cpuset modification and we are retrying - otherwise we 4792 * could end up iterating over non-eligible zones endlessly. 4793 */ 4794 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4795 ac->highest_zoneidx, ac->nodemask); 4796 if (!zonelist_zone(ac->preferred_zoneref)) 4797 goto nopage; 4798 4799 /* 4800 * Check for insane configurations where the cpuset doesn't contain 4801 * any suitable zone to satisfy the request - e.g. non-movable 4802 * GFP_HIGHUSER allocations from MOVABLE nodes only. 
4803 */ 4804 if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) { 4805 struct zoneref *z = first_zones_zonelist(ac->zonelist, 4806 ac->highest_zoneidx, 4807 &cpuset_current_mems_allowed); 4808 if (!zonelist_zone(z)) 4809 goto nopage; 4810 } 4811 4812 retry: 4813 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 4814 if (alloc_flags & ALLOC_KSWAPD) 4815 wake_all_kswapds(order, gfp_mask, ac); 4816 4817 /* 4818 * The adjusted alloc_flags might result in immediate success, so try 4819 * that first 4820 */ 4821 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 4822 if (page) 4823 goto got_pg; 4824 4825 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 4826 if (reserve_flags) 4827 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | 4828 (alloc_flags & ALLOC_KSWAPD); 4829 4830 /* 4831 * Reset the nodemask and zonelist iterators if memory policies can be 4832 * ignored. These allocations are high priority and system rather than 4833 * user oriented. 4834 */ 4835 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { 4836 ac->nodemask = NULL; 4837 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4838 ac->highest_zoneidx, ac->nodemask); 4839 4840 /* 4841 * The first time we adjust anything due to being allowed to 4842 * ignore memory policies or watermarks, retry immediately. This 4843 * allows us to keep the first allocation attempt optimistic so 4844 * it can succeed in a zone that is still above watermarks. 4845 */ 4846 if (can_retry_reserves) { 4847 can_retry_reserves = false; 4848 goto retry; 4849 } 4850 } 4851 4852 /* Caller is not willing to reclaim, we can't balance anything */ 4853 if (!can_direct_reclaim) { 4854 /* 4855 * Reclaim/compaction cannot run, so defrag_mode's strategy 4856 * of enforcing ALLOC_NOFRAGMENT cannot be fulfilled. Allow 4857 * fallbacks rather than failing the allocation outright. 
4858 */ 4859 if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT) && 4860 (gfp_mask & __GFP_KSWAPD_RECLAIM)) { 4861 alloc_flags &= ~ALLOC_NOFRAGMENT; 4862 goto retry; 4863 } 4864 goto nopage; 4865 } 4866 4867 /* Avoid recursion of direct reclaim */ 4868 if (current->flags & PF_MEMALLOC) 4869 goto nopage; 4870 4871 /* If allocation has taken excessively long, warn about it */ 4872 check_alloc_stall_warn(gfp_mask, ac->nodemask, order, alloc_start_time); 4873 4874 /* Try direct reclaim and then allocating */ 4875 if (!compact_first) { 4876 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, 4877 ac, &did_some_progress); 4878 if (page) 4879 goto got_pg; 4880 } 4881 4882 /* Try direct compaction and then allocating */ 4883 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 4884 compact_priority, &compact_result); 4885 if (page) 4886 goto got_pg; 4887 4888 if (compact_first) { 4889 /* 4890 * THP page faults may attempt local node only first, but are 4891 * then allowed to only compact, not reclaim, see 4892 * alloc_pages_mpol(). 4893 * 4894 * Compaction has failed above and we don't want such THP 4895 * allocations to put reclaim pressure on a single node in a 4896 * situation where other nodes might have plenty of available 4897 * memory. 4898 */ 4899 if (gfp_has_flags(gfp_mask, __GFP_NORETRY | __GFP_THISNODE)) 4900 goto nopage; 4901 4902 /* 4903 * For the initial compaction attempt we have lowered its 4904 * priority. Restore it for further retries, if those are 4905 * allowed. With __GFP_NORETRY there will be a single round of 4906 * reclaim and compaction with the lowered priority. 
4907 */ 4908 if (!(gfp_mask & __GFP_NORETRY)) 4909 compact_priority = DEF_COMPACT_PRIORITY; 4910 4911 compact_first = false; 4912 goto retry; 4913 } 4914 4915 /* Do not loop if specifically requested */ 4916 if (gfp_mask & __GFP_NORETRY) 4917 goto nopage; 4918 4919 /* 4920 * Do not retry costly high order allocations unless they are 4921 * __GFP_RETRY_MAYFAIL and we can compact 4922 */ 4923 if (costly_order && (!can_compact || 4924 !(gfp_mask & __GFP_RETRY_MAYFAIL))) 4925 goto nopage; 4926 4927 /* 4928 * Deal with possible cpuset update races or zonelist updates to avoid 4929 * infinite retries. No "goto retry;" can be placed above this check 4930 * unless it can execute just once. 4931 */ 4932 if (check_retry_cpuset(cpuset_mems_cookie, ac) || 4933 check_retry_zonelist(zonelist_iter_cookie)) 4934 goto restart; 4935 4936 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 4937 did_some_progress > 0, &no_progress_loops)) 4938 goto retry; 4939 4940 /* 4941 * It doesn't make any sense to retry for the compaction if the order-0 4942 * reclaim is not able to make any progress because the current 4943 * implementation of the compaction depends on the sufficient amount 4944 * of free memory (see __compaction_suitable) 4945 */ 4946 if (did_some_progress > 0 && can_compact && 4947 should_compact_retry(gfp_mask, ac, order, alloc_flags, 4948 compact_result, &compact_priority, 4949 &compaction_retries)) 4950 goto retry; 4951 4952 /* Reclaim/compaction failed to prevent the fallback */ 4953 if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) { 4954 alloc_flags &= ~ALLOC_NOFRAGMENT; 4955 goto retry; 4956 } 4957 4958 /* 4959 * Deal with possible cpuset update races or zonelist updates to avoid 4960 * a unnecessary OOM kill. 
4961 */ 4962 if (check_retry_cpuset(cpuset_mems_cookie, ac) || 4963 check_retry_zonelist(zonelist_iter_cookie)) 4964 goto restart; 4965 4966 /* Reclaim has failed us, start killing things */ 4967 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 4968 if (page) 4969 goto got_pg; 4970 4971 /* Avoid allocations with no watermarks from looping endlessly */ 4972 if (tsk_is_oom_victim(current) && 4973 (alloc_flags & ALLOC_OOM || 4974 (gfp_mask & __GFP_NOMEMALLOC))) 4975 goto nopage; 4976 4977 /* Retry as long as the OOM killer is making progress */ 4978 if (did_some_progress) { 4979 no_progress_loops = 0; 4980 goto retry; 4981 } 4982 4983 nopage: 4984 /* 4985 * Deal with possible cpuset update races or zonelist updates to avoid 4986 * a unnecessary OOM kill. 4987 */ 4988 if (check_retry_cpuset(cpuset_mems_cookie, ac) || 4989 check_retry_zonelist(zonelist_iter_cookie)) 4990 goto restart; 4991 4992 /* 4993 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure 4994 * we always retry 4995 */ 4996 if (unlikely(nofail)) { 4997 /* 4998 * Lacking direct_reclaim we can't do anything to reclaim memory, 4999 * we disregard these unreasonable nofail requests and still 5000 * return NULL 5001 */ 5002 if (!can_direct_reclaim) 5003 goto fail; 5004 5005 /* 5006 * Help non-failing allocations by giving some access to memory 5007 * reserves normally used for high priority non-blocking 5008 * allocations but do not use ALLOC_NO_WATERMARKS because this 5009 * could deplete whole memory reserves which would just make 5010 * the situation worse. 
5011 */ 5012 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); 5013 if (page) 5014 goto got_pg; 5015 5016 cond_resched(); 5017 goto retry; 5018 } 5019 fail: 5020 warn_alloc(gfp_mask, ac->nodemask, 5021 "page allocation failure: order:%u", order); 5022 got_pg: 5023 return page; 5024 } 5025 5026 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, 5027 int preferred_nid, nodemask_t *nodemask, 5028 struct alloc_context *ac, gfp_t *alloc_gfp, 5029 unsigned int *alloc_flags) 5030 { 5031 ac->highest_zoneidx = gfp_zone(gfp_mask); 5032 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); 5033 ac->nodemask = nodemask; 5034 ac->migratetype = gfp_migratetype(gfp_mask); 5035 5036 if (cpusets_enabled()) { 5037 *alloc_gfp |= __GFP_HARDWALL; 5038 /* 5039 * When we are in the interrupt context, it is irrelevant 5040 * to the current task context. It means that any node ok. 5041 */ 5042 if (in_task() && !ac->nodemask) 5043 ac->nodemask = &cpuset_current_mems_allowed; 5044 else 5045 *alloc_flags |= ALLOC_CPUSET; 5046 } 5047 5048 might_alloc(gfp_mask); 5049 5050 /* 5051 * Don't invoke should_fail logic, since it may call 5052 * get_random_u32() and printk() which need to spin_lock. 5053 */ 5054 if (!(*alloc_flags & ALLOC_TRYLOCK) && 5055 should_fail_alloc_page(gfp_mask, order)) 5056 return false; 5057 5058 *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); 5059 5060 /* Dirty zone balancing only done in the fast path */ 5061 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); 5062 5063 /* 5064 * The preferred zone is used for statistics but crucially it is 5065 * also used as the starting point for the zonelist iterator. It 5066 * may get reset for allocations that ignore memory policies. 
5067 */ 5068 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 5069 ac->highest_zoneidx, ac->nodemask); 5070 5071 return true; 5072 } 5073 5074 /* 5075 * __alloc_pages_bulk - Allocate a number of order-0 pages to an array 5076 * @gfp: GFP flags for the allocation 5077 * @preferred_nid: The preferred NUMA node ID to allocate from 5078 * @nodemask: Set of nodes to allocate from, may be NULL 5079 * @nr_pages: The number of pages desired in the array 5080 * @page_array: Array to store the pages 5081 * 5082 * This is a batched version of the page allocator that attempts to allocate 5083 * @nr_pages quickly. Pages are added to @page_array. 5084 * 5085 * Note that only the elements in @page_array that were cleared to %NULL on 5086 * entry are populated with newly allocated pages. @nr_pages is the maximum 5087 * number of pages that will be stored in the array. 5088 * 5089 * Returns the number of pages in @page_array, including ones already 5090 * allocated on entry. This can be less than the number requested in @nr_pages, 5091 * but all empty slots are filled from the beginning. I.e., if all slots in 5092 * @page_array were set to %NULL on entry, the slots from 0 to the return value 5093 * - 1 will be filled. 5094 */ 5095 unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, 5096 nodemask_t *nodemask, int nr_pages, 5097 struct page **page_array) 5098 { 5099 struct page *page; 5100 struct zone *zone; 5101 struct zoneref *z; 5102 struct per_cpu_pages *pcp; 5103 struct list_head *pcp_list; 5104 struct alloc_context ac; 5105 unsigned int alloc_flags = ALLOC_WMARK_LOW; 5106 int nr_populated = 0, nr_account = 0; 5107 5108 /* 5109 * Skip populated array elements to determine if any pages need 5110 * to be allocated before disabling IRQs. 5111 */ 5112 while (nr_populated < nr_pages && page_array[nr_populated]) 5113 nr_populated++; 5114 5115 /* No pages requested? */ 5116 if (unlikely(nr_pages <= 0)) 5117 goto out; 5118 5119 /* Already populated array? 
*/ 5120 if (unlikely(nr_pages - nr_populated == 0)) 5121 goto out; 5122 5123 /* Bulk allocator does not support memcg accounting. */ 5124 if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) 5125 goto failed; 5126 5127 /* Use the single page allocator for one page. */ 5128 if (nr_pages - nr_populated == 1) 5129 goto failed; 5130 5131 #ifdef CONFIG_PAGE_OWNER 5132 /* 5133 * PAGE_OWNER may recurse into the allocator to allocate space to 5134 * save the stack with pagesets.lock held. Releasing/reacquiring 5135 * removes much of the performance benefit of bulk allocation so 5136 * force the caller to allocate one page at a time as it'll have 5137 * similar performance to added complexity to the bulk allocator. 5138 */ 5139 if (static_branch_unlikely(&page_owner_inited)) 5140 goto failed; 5141 #endif 5142 5143 /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ 5144 gfp &= gfp_allowed_mask; 5145 if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &gfp, &alloc_flags)) 5146 goto out; 5147 5148 /* Find an allowed local zone that meets the low watermark. 
*/ 5149 z = ac.preferred_zoneref; 5150 for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) { 5151 unsigned long mark; 5152 5153 if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && 5154 !__cpuset_zone_allowed(zone, gfp)) { 5155 continue; 5156 } 5157 5158 if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) && 5159 zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) { 5160 goto failed; 5161 } 5162 5163 cond_accept_memory(zone, 0, alloc_flags); 5164 retry_this_zone: 5165 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages - nr_populated; 5166 if (zone_watermark_fast(zone, 0, mark, 5167 zonelist_zone_idx(ac.preferred_zoneref), 5168 alloc_flags, gfp)) { 5169 break; 5170 } 5171 5172 if (cond_accept_memory(zone, 0, alloc_flags)) 5173 goto retry_this_zone; 5174 5175 /* Try again if zone has deferred pages */ 5176 if (deferred_pages_enabled()) { 5177 if (_deferred_grow_zone(zone, 0)) 5178 goto retry_this_zone; 5179 } 5180 } 5181 5182 /* 5183 * If there are no allowed local zones that meets the watermarks then 5184 * try to allocate a single page and reclaim if necessary. 5185 */ 5186 if (unlikely(!zone)) 5187 goto failed; 5188 5189 /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. 
*/ 5190 pcp = pcp_spin_trylock(zone->per_cpu_pageset); 5191 if (!pcp) 5192 goto failed; 5193 5194 /* Attempt the batch allocation */ 5195 pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; 5196 while (nr_populated < nr_pages) { 5197 5198 /* Skip existing pages */ 5199 if (page_array[nr_populated]) { 5200 nr_populated++; 5201 continue; 5202 } 5203 5204 page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, 5205 pcp, pcp_list); 5206 if (unlikely(!page)) { 5207 /* Try and allocate at least one page */ 5208 if (!nr_account) { 5209 pcp_spin_unlock(pcp); 5210 goto failed; 5211 } 5212 break; 5213 } 5214 nr_account++; 5215 5216 prep_new_page(page, 0, gfp, 0); 5217 set_page_refcounted(page); 5218 page_array[nr_populated++] = page; 5219 } 5220 5221 pcp_spin_unlock(pcp); 5222 5223 __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); 5224 zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); 5225 5226 out: 5227 return nr_populated; 5228 5229 failed: 5230 page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); 5231 if (page) 5232 page_array[nr_populated++] = page; 5233 goto out; 5234 } 5235 EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); 5236 5237 /* 5238 * free_pages_bulk - Free an array of order-0 pages 5239 * @page_array: Array of pages to free 5240 * @nr_pages: The number of pages in the array 5241 * 5242 * Free the order-0 pages. Adjacent entries whose PFNs form a contiguous 5243 * run are released with a single __free_contig_range() call. 5244 * 5245 * This assumes page_array is sorted in ascending PFN order. Without that, 5246 * the function still frees all pages, but contiguous runs may not be 5247 * detected and the freeing pattern can degrade to freeing one page at a 5248 * time. 
5249 * 5250 * Context: Sleepable process context only; calls cond_resched() 5251 */ 5252 void free_pages_bulk(struct page **page_array, unsigned long nr_pages) 5253 { 5254 while (nr_pages) { 5255 unsigned long nr_contig = num_pages_contiguous(page_array, nr_pages); 5256 5257 __free_contig_range(page_to_pfn(*page_array), nr_contig); 5258 5259 nr_pages -= nr_contig; 5260 page_array += nr_contig; 5261 cond_resched(); 5262 } 5263 } 5264 5265 /* 5266 * This is the 'heart' of the zoned buddy allocator. 5267 */ 5268 struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, 5269 int preferred_nid, nodemask_t *nodemask) 5270 { 5271 struct page *page; 5272 unsigned int alloc_flags = ALLOC_WMARK_LOW; 5273 gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ 5274 struct alloc_context ac = { }; 5275 5276 /* 5277 * There are several places where we assume that the order value is sane 5278 * so bail out early if the request is out of bound. 5279 */ 5280 if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp)) 5281 return NULL; 5282 5283 gfp &= gfp_allowed_mask; 5284 /* 5285 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 5286 * resp. GFP_NOIO which has to be inherited for all allocation requests 5287 * from a particular context which has been marked by 5288 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures 5289 * movable zones are not used during allocation. 5290 */ 5291 gfp = current_gfp_context(gfp); 5292 alloc_gfp = gfp; 5293 if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, 5294 &alloc_gfp, &alloc_flags)) 5295 return NULL; 5296 5297 /* 5298 * Forbid the first pass from falling back to types that fragment 5299 * memory until all local zones are considered. 
5300 */ 5301 alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp); 5302 5303 /* First allocation attempt */ 5304 page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); 5305 if (likely(page)) 5306 goto out; 5307 5308 alloc_gfp = gfp; 5309 ac.spread_dirty_pages = false; 5310 5311 /* 5312 * Restore the original nodemask if it was potentially replaced with 5313 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 5314 */ 5315 ac.nodemask = nodemask; 5316 5317 page = __alloc_pages_slowpath(alloc_gfp, order, &ac); 5318 5319 out: 5320 if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && 5321 unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { 5322 free_frozen_pages(page, order); 5323 page = NULL; 5324 } 5325 5326 trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); 5327 kmsan_alloc_page(page, order, alloc_gfp); 5328 5329 return page; 5330 } 5331 EXPORT_SYMBOL(__alloc_frozen_pages_noprof); 5332 5333 struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, 5334 int preferred_nid, nodemask_t *nodemask) 5335 { 5336 struct page *page; 5337 5338 page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask); 5339 if (page) 5340 set_page_refcounted(page); 5341 return page; 5342 } 5343 EXPORT_SYMBOL(__alloc_pages_noprof); 5344 5345 struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, 5346 nodemask_t *nodemask) 5347 { 5348 struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order, 5349 preferred_nid, nodemask); 5350 return page_rmappable_folio(page); 5351 } 5352 EXPORT_SYMBOL(__folio_alloc_noprof); 5353 5354 /* 5355 * Common helper functions. Never use with __GFP_HIGHMEM because the returned 5356 * address cannot represent highmem pages. Use alloc_pages and then kmap if 5357 * you need to access high mem. 
5358 */ 5359 unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) 5360 { 5361 struct page *page; 5362 5363 page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order); 5364 if (!page) 5365 return 0; 5366 return (unsigned long) page_address(page); 5367 } 5368 EXPORT_SYMBOL(get_free_pages_noprof); 5369 5370 unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) 5371 { 5372 return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0); 5373 } 5374 EXPORT_SYMBOL(get_zeroed_page_noprof); 5375 5376 static void ___free_pages(struct page *page, unsigned int order, 5377 fpi_t fpi_flags) 5378 { 5379 /* get PageHead before we drop reference */ 5380 int head = PageHead(page); 5381 /* get alloc tag in case the page is released by others */ 5382 struct alloc_tag *tag = pgalloc_tag_get(page); 5383 5384 if (put_page_testzero(page)) 5385 __free_frozen_pages(page, order, fpi_flags); 5386 else if (!head) { 5387 pgalloc_tag_sub_pages(tag, (1 << order) - 1); 5388 while (order-- > 0) { 5389 /* 5390 * The "tail" pages of this non-compound high-order 5391 * page will have no code tags, so to avoid warnings 5392 * mark them as empty. 5393 */ 5394 clear_page_tag_ref(page + (1 << order)); 5395 __free_frozen_pages(page + (1 << order), order, 5396 fpi_flags); 5397 } 5398 } 5399 } 5400 5401 /** 5402 * __free_pages - Free pages allocated with alloc_pages(). 5403 * @page: The page pointer returned from alloc_pages(). 5404 * @order: The order of the allocation. 5405 * 5406 * This function can free multi-page allocations that are not compound 5407 * pages. It does not check that the @order passed in matches that of 5408 * the allocation, so it is easy to leak memory. Freeing more memory 5409 * than was allocated will probably emit a warning. 5410 * 5411 * If the last reference to this page is speculative, it will be released 5412 * by put_page() which only frees the first page of a non-compound 5413 * allocation. 
To prevent the remaining pages from being leaked, we free 5414 * the subsequent pages here. If you want to use the page's reference 5415 * count to decide when to free the allocation, you should allocate a 5416 * compound page, and use put_page() instead of __free_pages(). 5417 * 5418 * Context: May be called in interrupt context or while holding a normal 5419 * spinlock, but not in NMI context or while holding a raw spinlock. 5420 */ 5421 void __free_pages(struct page *page, unsigned int order) 5422 { 5423 ___free_pages(page, order, FPI_NONE); 5424 } 5425 EXPORT_SYMBOL(__free_pages); 5426 5427 /* 5428 * Can be called while holding raw_spin_lock or from IRQ and NMI for any 5429 * page type (not only those that came from alloc_pages_nolock) 5430 */ 5431 void free_pages_nolock(struct page *page, unsigned int order) 5432 { 5433 ___free_pages(page, order, FPI_TRYLOCK); 5434 } 5435 5436 /** 5437 * free_pages - Free pages allocated with __get_free_pages(). 5438 * @addr: The virtual address tied to a page returned from __get_free_pages(). 5439 * @order: The order of the allocation. 5440 * 5441 * This function behaves the same as __free_pages(). Use this function 5442 * to free pages when you only have a valid virtual address. If you have 5443 * the page, call __free_pages() instead. 
5444 */ 5445 void free_pages(unsigned long addr, unsigned int order) 5446 { 5447 if (addr != 0) { 5448 VM_BUG_ON(!virt_addr_valid((void *)addr)); 5449 __free_pages(virt_to_page((void *)addr), order); 5450 } 5451 } 5452 5453 EXPORT_SYMBOL(free_pages); 5454 5455 static void *make_alloc_exact(unsigned long addr, unsigned int order, 5456 size_t size) 5457 { 5458 if (addr) { 5459 unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); 5460 struct page *page = virt_to_page((void *)addr); 5461 struct page *last = page + nr; 5462 5463 __split_page(page, order); 5464 while (page < --last) 5465 set_page_refcounted(last); 5466 5467 last = page + (1UL << order); 5468 for (page += nr; page < last; page++) 5469 __free_pages_ok(page, 0, FPI_TO_TAIL); 5470 } 5471 return (void *)addr; 5472 } 5473 5474 /** 5475 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 5476 * @size: the number of bytes to allocate 5477 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 5478 * 5479 * This function is similar to alloc_pages(), except that it allocates the 5480 * minimum number of pages to satisfy the request. alloc_pages() can only 5481 * allocate memory in power-of-two pages. 5482 * 5483 * This function is also limited by MAX_PAGE_ORDER. 5484 * 5485 * Memory allocated by this function must be released by free_pages_exact(). 5486 * 5487 * Return: pointer to the allocated area or %NULL in case of error. 5488 */ 5489 void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) 5490 { 5491 unsigned int order = get_order(size); 5492 unsigned long addr; 5493 5494 if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) 5495 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); 5496 5497 addr = get_free_pages_noprof(gfp_mask, order); 5498 return make_alloc_exact(addr, order, size); 5499 } 5500 EXPORT_SYMBOL(alloc_pages_exact_noprof); 5501 5502 /** 5503 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 5504 * pages on a node. 
5505 * @nid: the preferred node ID where memory should be allocated 5506 * @size: the number of bytes to allocate 5507 * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 5508 * 5509 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 5510 * back. 5511 * 5512 * Return: pointer to the allocated area or %NULL in case of error. 5513 */ 5514 void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) 5515 { 5516 unsigned int order = get_order(size); 5517 struct page *p; 5518 5519 if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) 5520 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); 5521 5522 p = alloc_pages_node_noprof(nid, gfp_mask, order); 5523 if (!p) 5524 return NULL; 5525 return make_alloc_exact((unsigned long)page_address(p), order, size); 5526 } 5527 5528 /** 5529 * free_pages_exact - release memory allocated via alloc_pages_exact() 5530 * @virt: the value returned by alloc_pages_exact. 5531 * @size: size of allocation, same value as passed to alloc_pages_exact(). 5532 * 5533 * Release the memory allocated by a previous call to alloc_pages_exact. 5534 */ 5535 void free_pages_exact(void *virt, size_t size) 5536 { 5537 unsigned long addr = (unsigned long)virt; 5538 unsigned long end = addr + PAGE_ALIGN(size); 5539 5540 while (addr < end) { 5541 free_page(addr); 5542 addr += PAGE_SIZE; 5543 } 5544 } 5545 EXPORT_SYMBOL(free_pages_exact); 5546 5547 /** 5548 * nr_free_zone_pages - count number of pages beyond high watermark 5549 * @offset: The zone index of the highest zone 5550 * 5551 * nr_free_zone_pages() counts the number of pages which are beyond the 5552 * high watermark within all zones at or below a given zone index. For each 5553 * zone, the number of pages is calculated as: 5554 * 5555 * nr_free_zone_pages = managed_pages - high_pages 5556 * 5557 * Return: number of pages beyond high watermark. 
5558 */ 5559 static unsigned long nr_free_zone_pages(int offset) 5560 { 5561 struct zoneref *z; 5562 struct zone *zone; 5563 5564 /* Just pick one node, since fallback list is circular */ 5565 unsigned long sum = 0; 5566 5567 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 5568 5569 for_each_zone_zonelist(zone, z, zonelist, offset) { 5570 unsigned long size = zone_managed_pages(zone); 5571 unsigned long high = high_wmark_pages(zone); 5572 if (size > high) 5573 sum += size - high; 5574 } 5575 5576 return sum; 5577 } 5578 5579 /** 5580 * nr_free_buffer_pages - count number of pages beyond high watermark 5581 * 5582 * nr_free_buffer_pages() counts the number of pages which are beyond the high 5583 * watermark within ZONE_DMA and ZONE_NORMAL. 5584 * 5585 * Return: number of pages beyond high watermark within ZONE_DMA and 5586 * ZONE_NORMAL. 5587 */ 5588 unsigned long nr_free_buffer_pages(void) 5589 { 5590 return nr_free_zone_pages(gfp_zone(GFP_USER)); 5591 } 5592 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 5593 5594 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 5595 { 5596 zoneref->zone = zone; 5597 zoneref->zone_idx = zone_idx(zone); 5598 } 5599 5600 /* 5601 * Builds allocation fallback zone lists. 5602 * 5603 * Add all populated zones of a node to the zonelist. 5604 */ 5605 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) 5606 { 5607 struct zone *zone; 5608 enum zone_type zone_type = MAX_NR_ZONES; 5609 int nr_zones = 0; 5610 5611 do { 5612 zone_type--; 5613 zone = pgdat->node_zones + zone_type; 5614 if (populated_zone(zone)) { 5615 zoneref_set_zone(zone, &zonerefs[nr_zones++]); 5616 check_highest_zone(zone_type); 5617 } 5618 } while (zone_type); 5619 5620 return nr_zones; 5621 } 5622 5623 #ifdef CONFIG_NUMA 5624 5625 static int __parse_numa_zonelist_order(char *s) 5626 { 5627 /* 5628 * We used to support different zonelists modes but they turned 5629 * out to be just not useful. 
Let's keep the warning in place 5630 * if somebody still use the cmd line parameter so that we do 5631 * not fail it silently 5632 */ 5633 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { 5634 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); 5635 return -EINVAL; 5636 } 5637 return 0; 5638 } 5639 5640 static char numa_zonelist_order[] = "Node"; 5641 #define NUMA_ZONELIST_ORDER_LEN 16 5642 /* 5643 * sysctl handler for numa_zonelist_order 5644 */ 5645 static int numa_zonelist_order_handler(const struct ctl_table *table, int write, 5646 void *buffer, size_t *length, loff_t *ppos) 5647 { 5648 if (write) 5649 return __parse_numa_zonelist_order(buffer); 5650 return proc_dostring(table, write, buffer, length, ppos); 5651 } 5652 5653 static int node_load[MAX_NUMNODES]; 5654 5655 /** 5656 * find_next_best_node - find the next node that should appear in a given node's fallback list 5657 * @node: node whose fallback list we're appending 5658 * @used_node_mask: nodemask_t of already used nodes 5659 * 5660 * We use a number of factors to determine which is the next node that should 5661 * appear on a given node's fallback list. The node should not have appeared 5662 * already in @node's fallback list, and it should be the next closest node 5663 * according to the distance array (which contains arbitrary distance values 5664 * from each node to each node in the system), and should also prefer nodes 5665 * with no CPUs, since presumably they'll have very little allocation pressure 5666 * on them otherwise. 5667 * 5668 * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 5669 */ 5670 int find_next_best_node(int node, nodemask_t *used_node_mask) 5671 { 5672 int n, val; 5673 int min_val = INT_MAX; 5674 int best_node = NUMA_NO_NODE; 5675 5676 /* 5677 * Use the local node if we haven't already, but for memoryless local 5678 * node, we should skip it and fall back to other nodes. 
5679 */ 5680 if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) { 5681 node_set(node, *used_node_mask); 5682 return node; 5683 } 5684 5685 for_each_node_state(n, N_MEMORY) { 5686 5687 /* Don't want a node to appear more than once */ 5688 if (node_isset(n, *used_node_mask)) 5689 continue; 5690 5691 /* Use the distance array to find the distance */ 5692 val = node_distance(node, n); 5693 5694 /* Penalize nodes under us ("prefer the next node") */ 5695 val += (n < node); 5696 5697 /* Give preference to headless and unused nodes */ 5698 if (!cpumask_empty(cpumask_of_node(n))) 5699 val += PENALTY_FOR_NODE_WITH_CPUS; 5700 5701 /* Slight preference for less loaded node */ 5702 val *= MAX_NUMNODES; 5703 val += node_load[n]; 5704 5705 if (val < min_val) { 5706 min_val = val; 5707 best_node = n; 5708 } 5709 } 5710 5711 if (best_node >= 0) 5712 node_set(best_node, *used_node_mask); 5713 5714 return best_node; 5715 } 5716 5717 5718 /* 5719 * Build zonelists ordered by node and zones within node. 5720 * This results in maximum locality--normal zone overflows into local 5721 * DMA zone, if any--but risks exhausting DMA zone. 
5722 */ 5723 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, 5724 unsigned nr_nodes) 5725 { 5726 struct zoneref *zonerefs; 5727 int i; 5728 5729 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5730 5731 for (i = 0; i < nr_nodes; i++) { 5732 int nr_zones; 5733 5734 pg_data_t *node = NODE_DATA(node_order[i]); 5735 5736 nr_zones = build_zonerefs_node(node, zonerefs); 5737 zonerefs += nr_zones; 5738 } 5739 zonerefs->zone = NULL; 5740 zonerefs->zone_idx = 0; 5741 } 5742 5743 /* 5744 * Build __GFP_THISNODE zonelists 5745 */ 5746 static void build_thisnode_zonelists(pg_data_t *pgdat) 5747 { 5748 struct zoneref *zonerefs; 5749 int nr_zones; 5750 5751 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; 5752 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5753 zonerefs += nr_zones; 5754 zonerefs->zone = NULL; 5755 zonerefs->zone_idx = 0; 5756 } 5757 5758 static void build_zonelists(pg_data_t *pgdat) 5759 { 5760 static int node_order[MAX_NUMNODES]; 5761 int node, nr_nodes = 0; 5762 nodemask_t used_mask = NODE_MASK_NONE; 5763 int local_node, prev_node; 5764 5765 /* NUMA-aware ordering of nodes */ 5766 local_node = pgdat->node_id; 5767 prev_node = local_node; 5768 5769 memset(node_order, 0, sizeof(node_order)); 5770 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5771 /* 5772 * We don't want to pressure a particular node. 5773 * So adding penalty to the first node in same 5774 * distance group to make it round-robin. 
5775 */ 5776 if (node_distance(local_node, node) != 5777 node_distance(local_node, prev_node)) 5778 node_load[node] += 1; 5779 5780 node_order[nr_nodes++] = node; 5781 prev_node = node; 5782 } 5783 5784 build_zonelists_in_node_order(pgdat, node_order, nr_nodes); 5785 build_thisnode_zonelists(pgdat); 5786 pr_info("Fallback order for Node %d: ", local_node); 5787 for (node = 0; node < nr_nodes; node++) 5788 pr_cont("%d ", node_order[node]); 5789 pr_cont("\n"); 5790 } 5791 5792 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5793 /* 5794 * Return node id of node used for "local" allocations. 5795 * I.e., first node id of first zone in arg node's generic zonelist. 5796 * Used for initializing percpu 'numa_mem', which is used primarily 5797 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 5798 */ 5799 int local_memory_node(int node) 5800 { 5801 struct zoneref *z; 5802 5803 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 5804 gfp_zone(GFP_KERNEL), 5805 NULL); 5806 return zonelist_node_idx(z); 5807 } 5808 #endif 5809 5810 static void setup_min_unmapped_ratio(void); 5811 static void setup_min_slab_ratio(void); 5812 #else /* CONFIG_NUMA */ 5813 5814 static void build_zonelists(pg_data_t *pgdat) 5815 { 5816 struct zoneref *zonerefs; 5817 int nr_zones; 5818 5819 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5820 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5821 zonerefs += nr_zones; 5822 5823 zonerefs->zone = NULL; 5824 zonerefs->zone_idx = 0; 5825 } 5826 5827 #endif /* CONFIG_NUMA */ 5828 5829 /* 5830 * Boot pageset table. One per cpu which is going to be used for all 5831 * zones and all nodes. The parameters will be set in such a way 5832 * that an item put on a list will immediately be handed over to 5833 * the buddy list. This is safe since pageset manipulation is done 5834 * with interrupts disabled. 5835 * 5836 * The boot_pagesets must be kept even after bootup is complete for 5837 * unused processors and/or zones. 
They do play a role for bootstrapping 5838 * hotplugged processors. 5839 * 5840 * zoneinfo_show() and maybe other functions do 5841 * not check if the processor is online before following the pageset pointer. 5842 * Other parts of the kernel may not check if the zone is available. 5843 */ 5844 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); 5845 /* These effectively disable the pcplists in the boot pageset completely */ 5846 #define BOOT_PAGESET_HIGH 0 5847 #define BOOT_PAGESET_BATCH 1 5848 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); 5849 static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); 5850 5851 static void __build_all_zonelists(void *data) 5852 { 5853 int nid; 5854 int __maybe_unused cpu; 5855 pg_data_t *self = data; 5856 unsigned long flags; 5857 5858 /* 5859 * The zonelist_update_seq must be acquired with irqsave because the 5860 * reader can be invoked from IRQ with GFP_ATOMIC. 5861 */ 5862 write_seqlock_irqsave(&zonelist_update_seq, flags); 5863 /* 5864 * Also disable synchronous printk() to prevent any printk() from 5865 * trying to hold port->lock, for 5866 * tty_insert_flip_string_and_push_buffer() on other CPU might be 5867 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. 5868 */ 5869 printk_deferred_enter(); 5870 5871 #ifdef CONFIG_NUMA 5872 memset(node_load, 0, sizeof(node_load)); 5873 #endif 5874 5875 /* 5876 * This node is hotadded and no memory is yet present. So just 5877 * building zonelists is fine - no need to touch other nodes. 
5878 */ 5879 if (self && !node_online(self->node_id)) { 5880 build_zonelists(self); 5881 } else { 5882 /* 5883 * All possible nodes have pgdat preallocated 5884 * in free_area_init 5885 */ 5886 for_each_node(nid) { 5887 pg_data_t *pgdat = NODE_DATA(nid); 5888 5889 build_zonelists(pgdat); 5890 } 5891 5892 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5893 /* 5894 * We now know the "local memory node" for each node-- 5895 * i.e., the node of the first zone in the generic zonelist. 5896 * Set up numa_mem percpu variable for on-line cpus. During 5897 * boot, only the boot cpu should be on-line; we'll init the 5898 * secondary cpus' numa_mem as they come on-line. During 5899 * node/memory hotplug, we'll fixup all on-line cpus. 5900 */ 5901 for_each_online_cpu(cpu) 5902 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 5903 #endif 5904 } 5905 5906 printk_deferred_exit(); 5907 write_sequnlock_irqrestore(&zonelist_update_seq, flags); 5908 } 5909 5910 static noinline void __init 5911 build_all_zonelists_init(void) 5912 { 5913 int cpu; 5914 5915 __build_all_zonelists(NULL); 5916 5917 /* 5918 * Initialize the boot_pagesets that are going to be used 5919 * for bootstrapping processors. The real pagesets for 5920 * each zone will be allocated later when the per cpu 5921 * allocator is available. 5922 * 5923 * boot_pagesets are used also for bootstrapping offline 5924 * cpus if the system is already booted because the pagesets 5925 * are needed to initialize allocators on a specific cpu too. 5926 * F.e. the percpu allocator needs the page allocator which 5927 * needs the percpu allocator in order to allocate its pagesets 5928 * (a chicken-egg dilemma). 5929 */ 5930 for_each_possible_cpu(cpu) 5931 per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu)); 5932 5933 mminit_verify_zonelist(); 5934 cpuset_init_current_mems_allowed(); 5935 } 5936 5937 /* 5938 * unless system_state == SYSTEM_BOOTING. 
5939 * 5940 * __ref due to call of __init annotated helper build_all_zonelists_init 5941 * [protected by SYSTEM_BOOTING]. 5942 */ 5943 void __ref build_all_zonelists(pg_data_t *pgdat) 5944 { 5945 unsigned long vm_total_pages; 5946 5947 if (system_state == SYSTEM_BOOTING) { 5948 build_all_zonelists_init(); 5949 } else { 5950 __build_all_zonelists(pgdat); 5951 /* cpuset refresh routine should be here */ 5952 } 5953 /* Get the number of free pages beyond high watermark in all zones. */ 5954 vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 5955 /* 5956 * Disable grouping by mobility if the number of pages in the 5957 * system is too low to allow the mechanism to work. It would be 5958 * more accurate, but expensive to check per-zone. This check is 5959 * made on memory-hotadd so a system can start with mobility 5960 * disabled and enable it later 5961 */ 5962 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 5963 page_group_by_mobility_disabled = 1; 5964 else 5965 page_group_by_mobility_disabled = 0; 5966 5967 pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", 5968 nr_online_nodes, 5969 str_off_on(page_group_by_mobility_disabled), 5970 vm_total_pages); 5971 #ifdef CONFIG_NUMA 5972 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 5973 #endif 5974 } 5975 5976 static int zone_batchsize(struct zone *zone) 5977 { 5978 #ifdef CONFIG_MMU 5979 int batch; 5980 5981 /* 5982 * The number of pages to batch allocate is either ~0.025% 5983 * of the zone or 256KB, whichever is smaller. The batch 5984 * size is striking a balance between allocation latency 5985 * and zone lock contention. 5986 */ 5987 batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE); 5988 if (batch <= 1) 5989 return 1; 5990 5991 /* 5992 * Clamp the batch to a 2^n - 1 value. Having a power 5993 * of 2 value was found to be more likely to have 5994 * suboptimal cache aliasing properties in some cases. 
5995 * 5996 * For example if 2 tasks are alternately allocating 5997 * batches of pages, one task can end up with a lot 5998 * of pages of one half of the possible page colors 5999 * and the other with pages of the other colors. 6000 */ 6001 batch = rounddown_pow_of_two(batch + batch/2) - 1; 6002 6003 return batch; 6004 6005 #else 6006 /* The deferral and batching of frees should be suppressed under NOMMU 6007 * conditions. 6008 * 6009 * The problem is that NOMMU needs to be able to allocate large chunks 6010 * of contiguous memory as there's no hardware page translation to 6011 * assemble apparent contiguous memory from discontiguous pages. 6012 * 6013 * Queueing large contiguous runs of pages for batching, however, 6014 * causes the pages to actually be freed in smaller chunks. As there 6015 * can be a significant delay between the individual batches being 6016 * recycled, this leads to the once large chunks of space being 6017 * fragmented and becoming unavailable for high-order allocations. 6018 */ 6019 return 1; 6020 #endif 6021 } 6022 6023 static int percpu_pagelist_high_fraction; 6024 static int zone_highsize(struct zone *zone, int batch, int cpu_online, 6025 int high_fraction) 6026 { 6027 #ifdef CONFIG_MMU 6028 int high; 6029 int nr_split_cpus; 6030 unsigned long total_pages; 6031 6032 if (!high_fraction) { 6033 /* 6034 * By default, the high value of the pcp is based on the zone 6035 * low watermark so that if they are full then background 6036 * reclaim will not be started prematurely. 6037 */ 6038 total_pages = low_wmark_pages(zone); 6039 } else { 6040 /* 6041 * If percpu_pagelist_high_fraction is configured, the high 6042 * value is based on a fraction of the managed pages in the 6043 * zone. 6044 */ 6045 total_pages = zone_managed_pages(zone) / high_fraction; 6046 } 6047 6048 /* 6049 * Split the high value across all online CPUs local to the zone. 
Note 6050 * that early in boot that CPUs may not be online yet and that during 6051 * CPU hotplug that the cpumask is not yet updated when a CPU is being 6052 * onlined. For memory nodes that have no CPUs, split the high value 6053 * across all online CPUs to mitigate the risk that reclaim is triggered 6054 * prematurely due to pages stored on pcp lists. 6055 */ 6056 nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; 6057 if (!nr_split_cpus) 6058 nr_split_cpus = num_online_cpus(); 6059 high = total_pages / nr_split_cpus; 6060 6061 /* 6062 * Ensure high is at least batch*4. The multiple is based on the 6063 * historical relationship between high and batch. 6064 */ 6065 high = max(high, batch << 2); 6066 6067 return high; 6068 #else 6069 return 0; 6070 #endif 6071 } 6072 6073 /* 6074 * pcp->high and pcp->batch values are related and generally batch is lower 6075 * than high. They are also related to pcp->count such that count is lower 6076 * than high, and as soon as it reaches high, the pcplist is flushed. 6077 * 6078 * However, guaranteeing these relations at all times would require e.g. write 6079 * barriers here but also careful usage of read barriers at the read side, and 6080 * thus be prone to error and bad for performance. Thus the update only prevents 6081 * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max 6082 * should ensure they can cope with those fields changing asynchronously, and 6083 * fully trust only the pcp->count field on the local CPU with interrupts 6084 * disabled. 6085 * 6086 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 6087 * outside of boot time (or some other assurance that no concurrent updaters 6088 * exist). 
6089 */ 6090 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min, 6091 unsigned long high_max, unsigned long batch) 6092 { 6093 WRITE_ONCE(pcp->batch, batch); 6094 WRITE_ONCE(pcp->high_min, high_min); 6095 WRITE_ONCE(pcp->high_max, high_max); 6096 } 6097 6098 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) 6099 { 6100 int pindex; 6101 6102 memset(pcp, 0, sizeof(*pcp)); 6103 memset(pzstats, 0, sizeof(*pzstats)); 6104 6105 spin_lock_init(&pcp->lock); 6106 for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) 6107 INIT_LIST_HEAD(&pcp->lists[pindex]); 6108 6109 /* 6110 * Set batch and high values safe for a boot pageset. A true percpu 6111 * pageset's initialization will update them subsequently. Here we don't 6112 * need to be as careful as pageset_update() as nobody can access the 6113 * pageset yet. 6114 */ 6115 pcp->high_min = BOOT_PAGESET_HIGH; 6116 pcp->high_max = BOOT_PAGESET_HIGH; 6117 pcp->batch = BOOT_PAGESET_BATCH; 6118 } 6119 6120 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, 6121 unsigned long high_max, unsigned long batch) 6122 { 6123 struct per_cpu_pages *pcp; 6124 int cpu; 6125 6126 for_each_possible_cpu(cpu) { 6127 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 6128 pageset_update(pcp, high_min, high_max, batch); 6129 } 6130 } 6131 6132 /* 6133 * Calculate and set new high and batch values for all per-cpu pagesets of a 6134 * zone based on the zone's size. 6135 */ 6136 static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) 6137 { 6138 int new_high_min, new_high_max, new_batch; 6139 6140 new_batch = zone_batchsize(zone); 6141 if (percpu_pagelist_high_fraction) { 6142 new_high_min = zone_highsize(zone, new_batch, cpu_online, 6143 percpu_pagelist_high_fraction); 6144 /* 6145 * PCP high is tuned manually, disable auto-tuning via 6146 * setting high_min and high_max to the manual value. 
6147 */ 6148 new_high_max = new_high_min; 6149 } else { 6150 new_high_min = zone_highsize(zone, new_batch, cpu_online, 0); 6151 new_high_max = zone_highsize(zone, new_batch, cpu_online, 6152 MIN_PERCPU_PAGELIST_HIGH_FRACTION); 6153 } 6154 6155 if (zone->pageset_high_min == new_high_min && 6156 zone->pageset_high_max == new_high_max && 6157 zone->pageset_batch == new_batch) 6158 return; 6159 6160 zone->pageset_high_min = new_high_min; 6161 zone->pageset_high_max = new_high_max; 6162 zone->pageset_batch = new_batch; 6163 6164 __zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max, 6165 new_batch); 6166 } 6167 6168 void __meminit setup_zone_pageset(struct zone *zone) 6169 { 6170 int cpu; 6171 6172 /* Size may be 0 on !SMP && !NUMA */ 6173 if (sizeof(struct per_cpu_zonestat) > 0) 6174 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); 6175 6176 zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); 6177 for_each_possible_cpu(cpu) { 6178 struct per_cpu_pages *pcp; 6179 struct per_cpu_zonestat *pzstats; 6180 6181 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 6182 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); 6183 per_cpu_pages_init(pcp, pzstats); 6184 } 6185 6186 zone_set_pageset_high_and_batch(zone, 0); 6187 } 6188 6189 /* 6190 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6191 * page high values need to be recalculated. 
6192 */ 6193 static void zone_pcp_update(struct zone *zone, int cpu_online) 6194 { 6195 mutex_lock(&pcp_batch_high_lock); 6196 zone_set_pageset_high_and_batch(zone, cpu_online); 6197 mutex_unlock(&pcp_batch_high_lock); 6198 } 6199 6200 static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) 6201 { 6202 struct per_cpu_pages *pcp; 6203 struct cpu_cacheinfo *cci; 6204 6205 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 6206 cci = get_cpu_cacheinfo(cpu); 6207 /* 6208 * If data cache slice of CPU is large enough, "pcp->batch" 6209 * pages can be preserved in PCP before draining PCP for 6210 * consecutive high-order pages freeing without allocation. 6211 * This can reduce zone lock contention without hurting 6212 * cache-hot pages sharing. 6213 */ 6214 pcp_spin_lock_nopin(pcp); 6215 if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) 6216 pcp->flags |= PCPF_FREE_HIGH_BATCH; 6217 else 6218 pcp->flags &= ~PCPF_FREE_HIGH_BATCH; 6219 pcp_spin_unlock_nopin(pcp); 6220 } 6221 6222 void setup_pcp_cacheinfo(unsigned int cpu) 6223 { 6224 struct zone *zone; 6225 6226 for_each_populated_zone(zone) 6227 zone_pcp_update_cacheinfo(zone, cpu); 6228 } 6229 6230 /* 6231 * Allocate per cpu pagesets and initialize them. 6232 * Before this call only boot pagesets were available. 6233 */ 6234 void __init setup_per_cpu_pageset(void) 6235 { 6236 struct pglist_data *pgdat; 6237 struct zone *zone; 6238 int __maybe_unused cpu; 6239 6240 for_each_populated_zone(zone) 6241 setup_zone_pageset(zone); 6242 6243 #ifdef CONFIG_NUMA 6244 /* 6245 * Unpopulated zones continue using the boot pagesets. 6246 * The numa stats for these pagesets need to be reset. 6247 * Otherwise, they will end up skewing the stats of 6248 * the nodes these zones are associated with. 
6249 */ 6250 for_each_possible_cpu(cpu) { 6251 struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); 6252 memset(pzstats->vm_numa_event, 0, 6253 sizeof(pzstats->vm_numa_event)); 6254 } 6255 #endif 6256 6257 for_each_online_pgdat(pgdat) 6258 pgdat->per_cpu_nodestats = 6259 alloc_percpu(struct per_cpu_nodestat); 6260 } 6261 6262 __meminit void zone_pcp_init(struct zone *zone) 6263 { 6264 /* 6265 * per cpu subsystem is not up at this point. The following code 6266 * relies on the ability of the linker to provide the 6267 * offset of a (static) per cpu variable into the per cpu area. 6268 */ 6269 zone->per_cpu_pageset = &boot_pageset; 6270 zone->per_cpu_zonestats = &boot_zonestats; 6271 zone->pageset_high_min = BOOT_PAGESET_HIGH; 6272 zone->pageset_high_max = BOOT_PAGESET_HIGH; 6273 zone->pageset_batch = BOOT_PAGESET_BATCH; 6274 6275 if (populated_zone(zone)) 6276 pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, 6277 zone->present_pages, zone_batchsize(zone)); 6278 } 6279 6280 static void setup_per_zone_lowmem_reserve(void); 6281 6282 void adjust_managed_page_count(struct page *page, long count) 6283 { 6284 atomic_long_add(count, &page_zone(page)->managed_pages); 6285 totalram_pages_add(count); 6286 setup_per_zone_lowmem_reserve(); 6287 } 6288 EXPORT_SYMBOL(adjust_managed_page_count); 6289 6290 void free_reserved_page(struct page *page) 6291 { 6292 clear_page_tag_ref(page); 6293 ClearPageReserved(page); 6294 init_page_count(page); 6295 __free_page(page); 6296 adjust_managed_page_count(page, 1); 6297 } 6298 EXPORT_SYMBOL(free_reserved_page); 6299 6300 static int page_alloc_cpu_dead(unsigned int cpu) 6301 { 6302 struct zone *zone; 6303 6304 lru_add_drain_cpu(cpu); 6305 mlock_drain_remote(cpu); 6306 drain_pages(cpu); 6307 6308 /* 6309 * Spill the event counters of the dead processor 6310 * into the current processors event counters. 6311 * This artificially elevates the count of the current 6312 * processor. 
6313 */ 6314 vm_events_fold_cpu(cpu); 6315 6316 /* 6317 * Zero the differential counters of the dead processor 6318 * so that the vm statistics are consistent. 6319 * 6320 * This is only okay since the processor is dead and cannot 6321 * race with what we are doing. 6322 */ 6323 cpu_vm_stats_fold(cpu); 6324 6325 for_each_populated_zone(zone) 6326 zone_pcp_update(zone, 0); 6327 6328 return 0; 6329 } 6330 6331 static int page_alloc_cpu_online(unsigned int cpu) 6332 { 6333 struct zone *zone; 6334 6335 for_each_populated_zone(zone) 6336 zone_pcp_update(zone, 1); 6337 return 0; 6338 } 6339 6340 void __init page_alloc_init_cpuhp(void) 6341 { 6342 int ret; 6343 6344 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC, 6345 "mm/page_alloc:pcp", 6346 page_alloc_cpu_online, 6347 page_alloc_cpu_dead); 6348 WARN_ON(ret < 0); 6349 } 6350 6351 /* 6352 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 6353 * or min_free_kbytes changes. 6354 */ 6355 static void calculate_totalreserve_pages(void) 6356 { 6357 struct pglist_data *pgdat; 6358 unsigned long reserve_pages = 0; 6359 enum zone_type i, j; 6360 6361 for_each_online_pgdat(pgdat) { 6362 6363 pgdat->totalreserve_pages = 0; 6364 6365 for (i = 0; i < MAX_NR_ZONES; i++) { 6366 struct zone *zone = pgdat->node_zones + i; 6367 long max = 0; 6368 unsigned long managed_pages = zone_managed_pages(zone); 6369 6370 /* 6371 * lowmem_reserve[j] is monotonically non-decreasing 6372 * in j for a given zone (see 6373 * setup_per_zone_lowmem_reserve()). The maximum 6374 * valid reserve lives at the highest index with a 6375 * non-zero value, so scan backwards and stop at the 6376 * first hit. 6377 */ 6378 for (j = MAX_NR_ZONES - 1; j > i; j--) { 6379 if (!zone->lowmem_reserve[j]) 6380 continue; 6381 6382 max = zone->lowmem_reserve[j]; 6383 break; 6384 } 6385 /* we treat the high watermark as reserved pages. 
*/ 6386 max += high_wmark_pages(zone); 6387 6388 max = min_t(unsigned long, max, managed_pages); 6389 6390 pgdat->totalreserve_pages += max; 6391 6392 reserve_pages += max; 6393 } 6394 } 6395 totalreserve_pages = reserve_pages; 6396 trace_mm_calculate_totalreserve_pages(totalreserve_pages); 6397 } 6398 6399 /* 6400 * setup_per_zone_lowmem_reserve - called whenever 6401 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 6402 * has a correct pages reserved value, so an adequate number of 6403 * pages are left in the zone after a successful __alloc_pages(). 6404 */ 6405 static void setup_per_zone_lowmem_reserve(void) 6406 { 6407 struct pglist_data *pgdat; 6408 enum zone_type i, j; 6409 /* 6410 * For a given zone node_zones[i], lowmem_reserve[j] (j > i) 6411 * represents how many pages in zone i must effectively be kept 6412 * in reserve when deciding whether an allocation class that is 6413 * allowed to allocate from zones up to j may fall back into 6414 * zone i. 6415 * 6416 * As j increases, the allocation class can use a strictly larger 6417 * set of fallback zones and therefore must not be allowed to 6418 * deplete low zones more aggressively than a less flexible one. 6419 * As a result, lowmem_reserve[j] is required to be monotonically 6420 * non-decreasing in j for each zone i. Callers such as 6421 * calculate_totalreserve_pages() rely on this monotonicity when 6422 * selecting the maximum reserve entry. 
6423 */ 6424 for_each_online_pgdat(pgdat) { 6425 for (i = 0; i < MAX_NR_ZONES - 1; i++) { 6426 struct zone *zone = &pgdat->node_zones[i]; 6427 int ratio = sysctl_lowmem_reserve_ratio[i]; 6428 bool clear = !ratio || !zone_managed_pages(zone); 6429 unsigned long managed_pages = 0; 6430 6431 for (j = i + 1; j < MAX_NR_ZONES; j++) { 6432 struct zone *upper_zone = &pgdat->node_zones[j]; 6433 6434 managed_pages += zone_managed_pages(upper_zone); 6435 6436 if (clear) 6437 zone->lowmem_reserve[j] = 0; 6438 else 6439 zone->lowmem_reserve[j] = managed_pages / ratio; 6440 trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone, 6441 zone->lowmem_reserve[j]); 6442 } 6443 } 6444 } 6445 6446 /* update totalreserve_pages */ 6447 calculate_totalreserve_pages(); 6448 } 6449 6450 static void __setup_per_zone_wmarks(void) 6451 { 6452 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 6453 unsigned long lowmem_pages = 0; 6454 struct zone *zone; 6455 unsigned long flags; 6456 6457 /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */ 6458 for_each_zone(zone) { 6459 if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE) 6460 lowmem_pages += zone_managed_pages(zone); 6461 } 6462 6463 for_each_zone(zone) { 6464 u64 tmp; 6465 6466 spin_lock_irqsave(&zone->lock, flags); 6467 tmp = (u64)pages_min * zone_managed_pages(zone); 6468 tmp = div64_ul(tmp, lowmem_pages); 6469 if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { 6470 /* 6471 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 6472 * need highmem and movable zones pages, so cap pages_min 6473 * to a small value here. 6474 * 6475 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 6476 * deltas control async page reclaim, and so should 6477 * not be capped for highmem and movable zones. 
6478 */ 6479 unsigned long min_pages; 6480 6481 min_pages = zone_managed_pages(zone) / 1024; 6482 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 6483 zone->_watermark[WMARK_MIN] = min_pages; 6484 } else { 6485 /* 6486 * If it's a lowmem zone, reserve a number of pages 6487 * proportionate to the zone's size. 6488 */ 6489 zone->_watermark[WMARK_MIN] = tmp; 6490 } 6491 6492 /* 6493 * Set the kswapd watermarks distance according to the 6494 * scale factor in proportion to available memory, but 6495 * ensure a minimum size on small systems. 6496 */ 6497 tmp = max_t(u64, tmp >> 2, 6498 mult_frac(zone_managed_pages(zone), 6499 watermark_scale_factor, 10000)); 6500 6501 zone->watermark_boost = 0; 6502 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 6503 zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; 6504 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; 6505 trace_mm_setup_per_zone_wmarks(zone); 6506 6507 spin_unlock_irqrestore(&zone->lock, flags); 6508 } 6509 6510 /* update totalreserve_pages */ 6511 calculate_totalreserve_pages(); 6512 } 6513 6514 /** 6515 * setup_per_zone_wmarks - called when min_free_kbytes changes 6516 * or when memory is hot-{added|removed} 6517 * 6518 * Ensures that the watermark[min,low,high] values for each zone are set 6519 * correctly with respect to min_free_kbytes. 6520 */ 6521 void setup_per_zone_wmarks(void) 6522 { 6523 struct zone *zone; 6524 static DEFINE_SPINLOCK(lock); 6525 6526 spin_lock(&lock); 6527 __setup_per_zone_wmarks(); 6528 spin_unlock(&lock); 6529 6530 /* 6531 * The watermark size have changed so update the pcpu batch 6532 * and high limits or the limits may be inappropriate. 6533 */ 6534 for_each_zone(zone) 6535 zone_pcp_update(zone, 0); 6536 } 6537 6538 /* 6539 * Initialise min_free_kbytes. 6540 * 6541 * For small machines we want it small (128k min). For large machines 6542 * we want it large (256MB max). 
But it is not linear, because network 6543 * bandwidth does not increase linearly with machine size. We use 6544 * 6545 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 6546 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 6547 * 6548 * which yields 6549 * 6550 * 16MB: 512k 6551 * 32MB: 724k 6552 * 64MB: 1024k 6553 * 128MB: 1448k 6554 * 256MB: 2048k 6555 * 512MB: 2896k 6556 * 1024MB: 4096k 6557 * 2048MB: 5792k 6558 * 4096MB: 8192k 6559 * 8192MB: 11584k 6560 * 16384MB: 16384k 6561 */ 6562 void calculate_min_free_kbytes(void) 6563 { 6564 unsigned long lowmem_kbytes; 6565 int new_min_free_kbytes; 6566 6567 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 6568 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 6569 6570 if (new_min_free_kbytes > user_min_free_kbytes) 6571 min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); 6572 else 6573 pr_warn_ratelimited("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 6574 new_min_free_kbytes, user_min_free_kbytes); 6575 6576 } 6577 6578 int __meminit init_per_zone_wmark_min(void) 6579 { 6580 calculate_min_free_kbytes(); 6581 setup_per_zone_wmarks(); 6582 refresh_zone_stat_thresholds(); 6583 setup_per_zone_lowmem_reserve(); 6584 6585 #ifdef CONFIG_NUMA 6586 setup_min_unmapped_ratio(); 6587 setup_min_slab_ratio(); 6588 #endif 6589 6590 khugepaged_min_free_kbytes_update(); 6591 6592 return 0; 6593 } 6594 postcore_initcall(init_per_zone_wmark_min) 6595 6596 /* 6597 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 6598 * that we can call two helper functions whenever min_free_kbytes 6599 * changes. 
6600 */ 6601 static int min_free_kbytes_sysctl_handler(const struct ctl_table *table, int write, 6602 void *buffer, size_t *length, loff_t *ppos) 6603 { 6604 int rc; 6605 6606 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 6607 if (rc) 6608 return rc; 6609 6610 if (write) { 6611 user_min_free_kbytes = min_free_kbytes; 6612 setup_per_zone_wmarks(); 6613 } 6614 return 0; 6615 } 6616 6617 static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table, int write, 6618 void *buffer, size_t *length, loff_t *ppos) 6619 { 6620 int rc; 6621 6622 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 6623 if (rc) 6624 return rc; 6625 6626 if (write) 6627 setup_per_zone_wmarks(); 6628 6629 return 0; 6630 } 6631 6632 #ifdef CONFIG_NUMA 6633 static void setup_min_unmapped_ratio(void) 6634 { 6635 pg_data_t *pgdat; 6636 struct zone *zone; 6637 6638 for_each_online_pgdat(pgdat) 6639 pgdat->min_unmapped_pages = 0; 6640 6641 for_each_zone(zone) 6642 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * 6643 sysctl_min_unmapped_ratio) / 100; 6644 } 6645 6646 6647 static int sysctl_min_unmapped_ratio_sysctl_handler(const struct ctl_table *table, int write, 6648 void *buffer, size_t *length, loff_t *ppos) 6649 { 6650 int rc; 6651 6652 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 6653 if (rc) 6654 return rc; 6655 6656 setup_min_unmapped_ratio(); 6657 6658 return 0; 6659 } 6660 6661 static void setup_min_slab_ratio(void) 6662 { 6663 pg_data_t *pgdat; 6664 struct zone *zone; 6665 6666 for_each_online_pgdat(pgdat) 6667 pgdat->min_slab_pages = 0; 6668 6669 for_each_zone(zone) 6670 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * 6671 sysctl_min_slab_ratio) / 100; 6672 } 6673 6674 static int sysctl_min_slab_ratio_sysctl_handler(const struct ctl_table *table, int write, 6675 void *buffer, size_t *length, loff_t *ppos) 6676 { 6677 int rc; 6678 6679 rc = proc_dointvec_minmax(table, write, buffer, length, 
ppos); 6680 if (rc) 6681 return rc; 6682 6683 setup_min_slab_ratio(); 6684 6685 return 0; 6686 } 6687 #endif 6688 6689 /* 6690 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 6691 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 6692 * whenever sysctl_lowmem_reserve_ratio changes. 6693 * 6694 * The reserve ratio obviously has absolutely no relation with the 6695 * minimum watermarks. The lowmem reserve ratio can only make sense 6696 * if in function of the boot time zone sizes. 6697 */ 6698 static int lowmem_reserve_ratio_sysctl_handler(const struct ctl_table *table, 6699 int write, void *buffer, size_t *length, loff_t *ppos) 6700 { 6701 int i; 6702 6703 proc_dointvec_minmax(table, write, buffer, length, ppos); 6704 6705 for (i = 0; i < MAX_NR_ZONES; i++) { 6706 if (sysctl_lowmem_reserve_ratio[i] < 1) 6707 sysctl_lowmem_reserve_ratio[i] = 0; 6708 } 6709 6710 setup_per_zone_lowmem_reserve(); 6711 return 0; 6712 } 6713 6714 /* 6715 * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each 6716 * cpu. It is the fraction of total pages in each zone that a hot per cpu 6717 * pagelist can have before it gets flushed back to buddy allocator. 6718 */ 6719 static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *table, 6720 int write, void *buffer, size_t *length, loff_t *ppos) 6721 { 6722 struct zone *zone; 6723 int old_percpu_pagelist_high_fraction; 6724 int ret; 6725 6726 /* 6727 * Avoid using pcp_batch_high_lock for reads as the value is read 6728 * atomically and a race with offlining is harmless. 
6729 */ 6730 6731 if (!write) 6732 return proc_dointvec_minmax(table, write, buffer, length, ppos); 6733 6734 mutex_lock(&pcp_batch_high_lock); 6735 old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; 6736 6737 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 6738 if (ret < 0) 6739 goto out; 6740 6741 /* Sanity checking to avoid pcp imbalance */ 6742 if (percpu_pagelist_high_fraction && 6743 percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { 6744 percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; 6745 ret = -EINVAL; 6746 goto out; 6747 } 6748 6749 /* No change? */ 6750 if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) 6751 goto out; 6752 6753 for_each_populated_zone(zone) 6754 zone_set_pageset_high_and_batch(zone, 0); 6755 out: 6756 mutex_unlock(&pcp_batch_high_lock); 6757 return ret; 6758 } 6759 6760 static const struct ctl_table page_alloc_sysctl_table[] = { 6761 { 6762 .procname = "min_free_kbytes", 6763 .data = &min_free_kbytes, 6764 .maxlen = sizeof(min_free_kbytes), 6765 .mode = 0644, 6766 .proc_handler = min_free_kbytes_sysctl_handler, 6767 .extra1 = SYSCTL_ZERO, 6768 }, 6769 { 6770 .procname = "watermark_boost_factor", 6771 .data = &watermark_boost_factor, 6772 .maxlen = sizeof(watermark_boost_factor), 6773 .mode = 0644, 6774 .proc_handler = proc_dointvec_minmax, 6775 .extra1 = SYSCTL_ZERO, 6776 }, 6777 { 6778 .procname = "watermark_scale_factor", 6779 .data = &watermark_scale_factor, 6780 .maxlen = sizeof(watermark_scale_factor), 6781 .mode = 0644, 6782 .proc_handler = watermark_scale_factor_sysctl_handler, 6783 .extra1 = SYSCTL_ONE, 6784 .extra2 = SYSCTL_THREE_THOUSAND, 6785 }, 6786 { 6787 .procname = "defrag_mode", 6788 .data = &defrag_mode, 6789 .maxlen = sizeof(defrag_mode), 6790 .mode = 0644, 6791 .proc_handler = proc_dointvec_minmax, 6792 .extra1 = SYSCTL_ZERO, 6793 .extra2 = SYSCTL_ONE, 6794 }, 6795 { 6796 .procname = "percpu_pagelist_high_fraction", 6797 .data = 
&percpu_pagelist_high_fraction, 6798 .maxlen = sizeof(percpu_pagelist_high_fraction), 6799 .mode = 0644, 6800 .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, 6801 .extra1 = SYSCTL_ZERO, 6802 }, 6803 { 6804 .procname = "lowmem_reserve_ratio", 6805 .data = &sysctl_lowmem_reserve_ratio, 6806 .maxlen = sizeof(sysctl_lowmem_reserve_ratio), 6807 .mode = 0644, 6808 .proc_handler = lowmem_reserve_ratio_sysctl_handler, 6809 }, 6810 #ifdef CONFIG_NUMA 6811 { 6812 .procname = "numa_zonelist_order", 6813 .data = &numa_zonelist_order, 6814 .maxlen = NUMA_ZONELIST_ORDER_LEN, 6815 .mode = 0644, 6816 .proc_handler = numa_zonelist_order_handler, 6817 }, 6818 { 6819 .procname = "min_unmapped_ratio", 6820 .data = &sysctl_min_unmapped_ratio, 6821 .maxlen = sizeof(sysctl_min_unmapped_ratio), 6822 .mode = 0644, 6823 .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, 6824 .extra1 = SYSCTL_ZERO, 6825 .extra2 = SYSCTL_ONE_HUNDRED, 6826 }, 6827 { 6828 .procname = "min_slab_ratio", 6829 .data = &sysctl_min_slab_ratio, 6830 .maxlen = sizeof(sysctl_min_slab_ratio), 6831 .mode = 0644, 6832 .proc_handler = sysctl_min_slab_ratio_sysctl_handler, 6833 .extra1 = SYSCTL_ZERO, 6834 .extra2 = SYSCTL_ONE_HUNDRED, 6835 }, 6836 #endif 6837 }; 6838 6839 void __init page_alloc_sysctl_init(void) 6840 { 6841 register_sysctl_init("vm", page_alloc_sysctl_table); 6842 } 6843 6844 static void free_prepared_contig_range(struct page *page, 6845 unsigned long nr_pages) 6846 { 6847 unsigned long pfn = page_to_pfn(page); 6848 6849 while (nr_pages) { 6850 unsigned int order; 6851 6852 /* We are limited by the largest buddy order. */ 6853 order = pfn ? __ffs(pfn) : MAX_PAGE_ORDER; 6854 /* Don't exceed the number of pages to free. */ 6855 order = min_t(unsigned int, order, ilog2(nr_pages)); 6856 order = min_t(unsigned int, order, MAX_PAGE_ORDER); 6857 6858 /* 6859 * Free the chunk as a single block. Our caller has already 6860 * called free_pages_prepare() for each order-0 page. 
6861 */ 6862 __free_frozen_pages(page, order, FPI_PREPARED); 6863 6864 pfn += 1UL << order; 6865 page += 1UL << order; 6866 nr_pages -= 1UL << order; 6867 } 6868 } 6869 6870 static void __free_contig_range_common(unsigned long pfn, unsigned long nr_pages, 6871 bool is_frozen) 6872 { 6873 struct page *page, *start = NULL; 6874 unsigned long nr_start = 0; 6875 unsigned long start_sec; 6876 unsigned long i; 6877 6878 for (i = 0; i < nr_pages; i++) { 6879 bool can_free = true; 6880 6881 /* 6882 * Contiguous PFNs might not have contiguous "struct pages" 6883 * in some kernel configs: page++ across a section boundary 6884 * is undefined. Use pfn_to_page() for each PFN. 6885 */ 6886 page = pfn_to_page(pfn + i); 6887 6888 VM_WARN_ON_ONCE(PageHead(page)); 6889 VM_WARN_ON_ONCE(PageTail(page)); 6890 6891 if (!is_frozen) 6892 can_free = put_page_testzero(page); 6893 6894 if (can_free) 6895 can_free = free_pages_prepare(page, 0); 6896 6897 if (!can_free) { 6898 if (start) { 6899 free_prepared_contig_range(start, i - nr_start); 6900 start = NULL; 6901 } 6902 continue; 6903 } 6904 6905 if (start && memdesc_section(page->flags) != start_sec) { 6906 free_prepared_contig_range(start, i - nr_start); 6907 start = page; 6908 nr_start = i; 6909 start_sec = memdesc_section(page->flags); 6910 } else if (!start) { 6911 start = page; 6912 nr_start = i; 6913 start_sec = memdesc_section(page->flags); 6914 } 6915 } 6916 6917 if (start) 6918 free_prepared_contig_range(start, nr_pages - nr_start); 6919 } 6920 6921 /** 6922 * __free_contig_range - Free contiguous range of order-0 pages. 6923 * @pfn: Page frame number of the first page in the range. 6924 * @nr_pages: Number of pages to free. 6925 * 6926 * For each order-0 struct page in the physically contiguous range, put a 6927 * reference. Free any page who's reference count falls to zero. The 6928 * implementation is functionally equivalent to, but significantly faster than 6929 * calling __free_page() for each struct page in a loop. 
6930 * 6931 * Memory allocated with alloc_pages(order>=1) then subsequently split to 6932 * order-0 with split_page() is an example of appropriate contiguous pages that 6933 * can be freed with this API. 6934 * 6935 * Context: May be called in interrupt context or while holding a normal 6936 * spinlock, but not in NMI context or while holding a raw spinlock. 6937 */ 6938 void __free_contig_range(unsigned long pfn, unsigned long nr_pages) 6939 { 6940 __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ false); 6941 } 6942 6943 #ifdef CONFIG_CONTIG_ALLOC 6944 /* Usage: See admin-guide/dynamic-debug-howto.rst */ 6945 static void alloc_contig_dump_pages(struct list_head *page_list) 6946 { 6947 DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); 6948 6949 if (DYNAMIC_DEBUG_BRANCH(descriptor)) { 6950 struct page *page; 6951 6952 dump_stack(); 6953 list_for_each_entry(page, page_list, lru) 6954 dump_page(page, "migration failure"); 6955 } 6956 } 6957 6958 /* [start, end) must belong to a single zone. */ 6959 static int __alloc_contig_migrate_range(struct compact_control *cc, 6960 unsigned long start, unsigned long end) 6961 { 6962 /* This function is based on compact_zone() from compaction.c. 
*/ 6963 unsigned int nr_reclaimed; 6964 unsigned long pfn = start; 6965 unsigned int tries = 0; 6966 int ret = 0; 6967 struct migration_target_control mtc = { 6968 .nid = zone_to_nid(cc->zone), 6969 .gfp_mask = cc->gfp_mask, 6970 .reason = MR_CONTIG_RANGE, 6971 }; 6972 6973 lru_cache_disable(); 6974 6975 while (pfn < end || !list_empty(&cc->migratepages)) { 6976 if (fatal_signal_pending(current)) { 6977 ret = -EINTR; 6978 break; 6979 } 6980 6981 if (list_empty(&cc->migratepages)) { 6982 cc->nr_migratepages = 0; 6983 ret = isolate_migratepages_range(cc, pfn, end); 6984 if (ret && ret != -EAGAIN) 6985 break; 6986 pfn = cc->migrate_pfn; 6987 tries = 0; 6988 } else if (++tries == 5) { 6989 ret = -EBUSY; 6990 break; 6991 } 6992 6993 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6994 &cc->migratepages); 6995 cc->nr_migratepages -= nr_reclaimed; 6996 6997 ret = migrate_pages(&cc->migratepages, alloc_migration_target, 6998 NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); 6999 7000 /* 7001 * On -ENOMEM, migrate_pages() bails out right away. It is pointless 7002 * to retry again over this error, so do the same here. 7003 */ 7004 if (ret == -ENOMEM) 7005 break; 7006 } 7007 7008 lru_cache_enable(); 7009 if (ret < 0) { 7010 if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) 7011 alloc_contig_dump_pages(&cc->migratepages); 7012 putback_movable_pages(&cc->migratepages); 7013 } 7014 7015 return (ret < 0) ? ret : 0; 7016 } 7017 7018 static void split_free_frozen_pages(struct list_head *list, gfp_t gfp_mask) 7019 { 7020 int order; 7021 7022 for (order = 0; order < NR_PAGE_ORDERS; order++) { 7023 struct page *page, *next; 7024 int nr_pages = 1 << order; 7025 7026 list_for_each_entry_safe(page, next, &list[order], lru) { 7027 int i; 7028 7029 post_alloc_hook(page, order, gfp_mask); 7030 if (!order) 7031 continue; 7032 7033 __split_page(page, order); 7034 7035 /* Add all subpages to the order-0 head, in sequence. 
*/ 7036 list_del(&page->lru); 7037 for (i = 0; i < nr_pages; i++) 7038 list_add_tail(&page[i].lru, &list[0]); 7039 } 7040 } 7041 } 7042 7043 static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) 7044 { 7045 const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; 7046 const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | 7047 __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO | 7048 __GFP_SKIP_KASAN; 7049 const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 7050 7051 /* 7052 * We are given the range to allocate; node, mobility and placement 7053 * hints are irrelevant at this point. We'll simply ignore them. 7054 */ 7055 gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE | 7056 __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE); 7057 7058 /* 7059 * We only support most reclaim flags (but not NOFAIL/NORETRY), and 7060 * selected action flags. 7061 */ 7062 if (gfp_mask & ~(reclaim_mask | action_mask)) 7063 return -EINVAL; 7064 7065 /* 7066 * Flags to control page compaction/migration/reclaim, to free up our 7067 * page range. Migratable pages are movable, __GFP_MOVABLE is implied 7068 * for them. 7069 * 7070 * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that 7071 * to not degrade callers. 7072 */ 7073 *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | 7074 __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; 7075 return 0; 7076 } 7077 7078 static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) 7079 { 7080 __free_contig_range_common(pfn, nr_pages, /* is_frozen= */ true); 7081 } 7082 7083 /** 7084 * alloc_contig_frozen_range() -- tries to allocate given range of frozen pages 7085 * @start: start PFN to allocate 7086 * @end: one-past-the-last PFN to allocate 7087 * @alloc_flags: allocation information 7088 * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some 7089 * action and reclaim modifiers are supported. 
Reclaim modifiers
 *		control allocation behavior during compaction/migration/reclaim.
 *
 * The PFN range does not have to be pageblock aligned. The PFN range must
 * belong to a single zone.
 *
 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
 * pageblocks in the range.  Once isolated, the pageblocks should not
 * be modified by others.
 *
 * All frozen pages whose PFN is in [start, end) are allocated for the
 * caller, and they can be freed with free_contig_frozen_range();
 * free_frozen_pages() can also be used to free compound frozen pages
 * directly.
 *
 * Return: zero on success or negative error code.
 */
int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end,
		acr_flags_t alloc_flags, gfp_t gfp_mask)
{
	/* Only consumed on the __GFP_COMP paths below. */
	const unsigned int order = ilog2(end - start);
	unsigned long outer_start, outer_end;
	int ret = 0;

	struct compact_control cc = {
		.nr_migratepages = 0,
		.order = -1,
		.zone = page_zone(pfn_to_page(start)),
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
		.no_set_skip_hint = true,
		.alloc_contig = true,
	};
	INIT_LIST_HEAD(&cc.migratepages);
	enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ?
		PB_ISOLATE_MODE_CMA_ALLOC :
		PB_ISOLATE_MODE_OTHER;

	/*
	 * In contrast to the buddy, we allow for orders here that exceed
	 * MAX_PAGE_ORDER, so we must manually make sure that we are not
	 * exceeding the maximum folio order.
	 */
	if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER))
		return -EINVAL;

	/*
	 * The two argument checks here return directly: nothing has been
	 * isolated yet, so there is nothing for the "done" label to undo.
	 */
	gfp_mask = current_gfp_context(gfp_mask);
	if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
		return -EINVAL;

	/*
	 * What we do here is we mark all pageblocks in range as
	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
	 * have different sizes, and due to the way the page allocator
	 * works, start_isolate_page_range() has special handling for this.
	 *
	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
	 * migrate the pages from an unaligned range (ie. pages that
	 * we are interested in).  This will put all the pages in
	 * range back to page allocator as MIGRATE_ISOLATE.
	 *
	 * When this is done, we take the pages in range from page
	 * allocator removing them from the buddy system.  This way
	 * page allocator will never consider using them.
	 *
	 * This lets us mark the pageblocks back as
	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
	 * aligned range but not in the unaligned, original range are
	 * put back to page allocator so that buddy can use them.
	 */

	ret = start_isolate_page_range(start, end, mode);
	if (ret)
		goto done;

	drain_all_pages(cc.zone);

	/*
	 * In case of -EBUSY, we'd like to know which page causes problem.
	 * So, just fall through. test_pages_isolated() has a tracepoint
	 * which will report the busy page.
	 *
	 * It is possible that busy pages could become available before
	 * the call to test_pages_isolated, and the range will actually be
	 * allocated.  So, if we fall through be sure to clear ret so that
	 * -EBUSY is not accidentally used or returned to caller.
	 */
	ret = __alloc_contig_migrate_range(&cc, start, end);
	if (ret && ret != -EBUSY)
		goto done;

	/*
	 * When in-use hugetlb pages are migrated, they may simply be released
	 * back into the free hugepage pool instead of being returned to the
	 * buddy system.  After the migration of in-use huge pages is completed,
	 * we will invoke replace_free_hugepage_folios() to ensure that these
	 * hugepages are properly released to the buddy system.
	 *
	 * Assigning ret here also overwrites a tolerated -EBUSY left over
	 * from the migration step above.
	 */
	ret = replace_free_hugepage_folios(start, end);
	if (ret)
		goto done;

	/*
	 * Pages from [start, end) are within a pageblock_nr_pages
	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
	 * more, all pages in [start, end) are free in page allocator.
	 * What we are going to do is to allocate all pages from
	 * [start, end) (that is remove them from page allocator).
	 *
	 * The only problem is that pages at the beginning and at the
	 * end of interesting range may be not aligned with pages that
	 * page allocator holds, ie. they can be part of higher order
	 * pages.  Because of this, we reserve the bigger range and
	 * once this is done free the pages we are not interested in.
	 *
	 * We don't have to hold zone->lock here because the pages are
	 * isolated thus they won't get removed from buddy.
	 */
	outer_start = find_large_buddy(start);

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, mode)) {
		ret = -EBUSY;
		goto done;
	}

	/* Grab isolated pages from freelists. */
	outer_end = isolate_freepages_range(&cc, outer_start, end);
	if (!outer_end) {
		ret = -EBUSY;
		goto done;
	}

	if (!(gfp_mask & __GFP_COMP)) {
		split_free_frozen_pages(cc.freepages, gfp_mask);

		/* Free head and tail (if any) */
		if (start != outer_start)
			__free_contig_frozen_range(outer_start, start - outer_start);
		if (end != outer_end)
			__free_contig_frozen_range(end, outer_end - end);
	} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
		/*
		 * Compound request: only accepted when the grabbed range is
		 * exactly the requested one and its size is a power of two.
		 */
		struct page *head = pfn_to_page(start);

		check_new_pages(head, order);
		prep_new_page(head, order, gfp_mask, 0);
	} else {
		ret = -EINVAL;
		WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
		     start, end, outer_start, outer_end);
	}
done:
	/*
	 * Reached on success and on every failure after the argument checks
	 * (including a failed start_isolate_page_range()): the isolation of
	 * [start, end) is always undone here.
	 */
	undo_isolate_page_range(start, end);
	return ret;
}
EXPORT_SYMBOL(alloc_contig_frozen_range_noprof);

/**
 * alloc_contig_range() -- tries to allocate given range of pages
 * @start:	start PFN to allocate
 * @end:	one-past-the-last PFN to allocate
 * @alloc_flags:	allocation information
 * @gfp_mask:	GFP mask.
 *
 * This routine is a wrapper around alloc_contig_frozen_range(), it can't
 * be used to allocate compound pages, the refcount of each allocated page
 * will be set to one.
 *
 * All pages whose PFN is in [start, end) are allocated for the caller,
 * and should be freed with free_contig_range() or by manually calling
 * __free_page() on each allocated page.
 *
 * Return: zero on success or negative error code.
7262 */ 7263 int alloc_contig_range_noprof(unsigned long start, unsigned long end, 7264 acr_flags_t alloc_flags, gfp_t gfp_mask) 7265 { 7266 int ret; 7267 7268 if (WARN_ON(gfp_mask & __GFP_COMP)) 7269 return -EINVAL; 7270 7271 ret = alloc_contig_frozen_range_noprof(start, end, alloc_flags, gfp_mask); 7272 if (!ret) 7273 set_pages_refcounted(pfn_to_page(start), end - start); 7274 7275 return ret; 7276 } 7277 EXPORT_SYMBOL(alloc_contig_range_noprof); 7278 7279 static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, 7280 unsigned long nr_pages, bool skip_hugetlb, 7281 bool *skipped_hugetlb) 7282 { 7283 unsigned long end_pfn = start_pfn + nr_pages; 7284 struct page *page; 7285 7286 while (start_pfn < end_pfn) { 7287 unsigned long step = 1; 7288 7289 page = pfn_to_online_page(start_pfn); 7290 if (!page) 7291 return false; 7292 7293 if (page_zone(page) != z) 7294 return false; 7295 7296 if (page_is_unmovable(z, page, PB_ISOLATE_MODE_OTHER, &step)) 7297 return false; 7298 7299 /* 7300 * Only consider ranges containing hugepages if those pages are 7301 * smaller than the requested contiguous region. e.g.: 7302 * Move 2MB pages to free up a 1GB range. 7303 * Don't move 1GB pages to free up a 2MB range. 7304 * 7305 * This makes contiguous allocation more reliable if multiple 7306 * hugepage sizes are used without causing needless movement. 
7307 */ 7308 if (PageHuge(page)) { 7309 unsigned int order; 7310 7311 if (skip_hugetlb) { 7312 *skipped_hugetlb = true; 7313 return false; 7314 } 7315 7316 page = compound_head(page); 7317 order = compound_order(page); 7318 if ((order >= MAX_FOLIO_ORDER) || 7319 (nr_pages <= (1 << order))) 7320 return false; 7321 } 7322 7323 start_pfn += step; 7324 } 7325 return true; 7326 } 7327 7328 static bool zone_spans_last_pfn(const struct zone *zone, 7329 unsigned long start_pfn, unsigned long nr_pages) 7330 { 7331 unsigned long last_pfn = start_pfn + nr_pages - 1; 7332 7333 return zone_spans_pfn(zone, last_pfn); 7334 } 7335 7336 /** 7337 * alloc_contig_frozen_pages() -- tries to find and allocate contiguous range of frozen pages 7338 * @nr_pages: Number of contiguous pages to allocate 7339 * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some 7340 * action and reclaim modifiers are supported. Reclaim modifiers 7341 * control allocation behavior during compaction/migration/reclaim. 7342 * @nid: Target node 7343 * @nodemask: Mask for other possible nodes 7344 * 7345 * This routine is a wrapper around alloc_contig_frozen_range(). It scans over 7346 * zones on an applicable zonelist to find a contiguous pfn range which can then 7347 * be tried for allocation with alloc_contig_frozen_range(). This routine is 7348 * intended for allocation requests which can not be fulfilled with the buddy 7349 * allocator. 7350 * 7351 * The allocated memory is always aligned to a page boundary. If nr_pages is a 7352 * power of two, then allocated range is also guaranteed to be aligned to same 7353 * nr_pages (e.g. 1GB request would be aligned to 1GB). 7354 * 7355 * Allocated frozen pages need be freed with free_contig_frozen_range(), 7356 * or by manually calling free_frozen_pages() on each allocated frozen 7357 * non-compound page, for compound frozen pages could be freed with 7358 * free_frozen_pages() directly. 
7359 * 7360 * Return: pointer to contiguous frozen pages on success, or NULL if not successful. 7361 */ 7362 struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages, 7363 gfp_t gfp_mask, int nid, nodemask_t *nodemask) 7364 { 7365 unsigned long ret, pfn, flags; 7366 struct zonelist *zonelist; 7367 struct zone *zone; 7368 struct zoneref *z; 7369 bool skip_hugetlb = true; 7370 bool skipped_hugetlb = false; 7371 7372 retry: 7373 zonelist = node_zonelist(nid, gfp_mask); 7374 for_each_zone_zonelist_nodemask(zone, z, zonelist, 7375 gfp_zone(gfp_mask), nodemask) { 7376 spin_lock_irqsave(&zone->lock, flags); 7377 7378 pfn = ALIGN(zone->zone_start_pfn, nr_pages); 7379 while (zone_spans_last_pfn(zone, pfn, nr_pages)) { 7380 if (pfn_range_valid_contig(zone, pfn, nr_pages, 7381 skip_hugetlb, 7382 &skipped_hugetlb)) { 7383 /* 7384 * We release the zone lock here because 7385 * alloc_contig_frozen_range() will also lock 7386 * the zone at some point. If there's an 7387 * allocation spinning on this lock, it may 7388 * win the race and cause allocation to fail. 7389 */ 7390 spin_unlock_irqrestore(&zone->lock, flags); 7391 ret = alloc_contig_frozen_range_noprof(pfn, 7392 pfn + nr_pages, 7393 ACR_FLAGS_NONE, 7394 gfp_mask); 7395 if (!ret) 7396 return pfn_to_page(pfn); 7397 spin_lock_irqsave(&zone->lock, flags); 7398 } 7399 pfn += nr_pages; 7400 } 7401 spin_unlock_irqrestore(&zone->lock, flags); 7402 } 7403 /* 7404 * If we failed, retry the search, but treat regions with HugeTLB pages 7405 * as valid targets. This retains fast-allocations on first pass 7406 * without trying to migrate HugeTLB pages (which may fail). On the 7407 * second pass, we will try moving HugeTLB pages when those pages are 7408 * smaller than the requested contiguous region size. 
7409 */ 7410 if (skip_hugetlb && skipped_hugetlb) { 7411 skip_hugetlb = false; 7412 goto retry; 7413 } 7414 return NULL; 7415 } 7416 EXPORT_SYMBOL(alloc_contig_frozen_pages_noprof); 7417 7418 /** 7419 * alloc_contig_pages() -- tries to find and allocate contiguous range of pages 7420 * @nr_pages: Number of contiguous pages to allocate 7421 * @gfp_mask: GFP mask. 7422 * @nid: Target node 7423 * @nodemask: Mask for other possible nodes 7424 * 7425 * This routine is a wrapper around alloc_contig_frozen_pages(), it can't 7426 * be used to allocate compound pages, the refcount of each allocated page 7427 * will be set to one. 7428 * 7429 * Allocated pages can be freed with free_contig_range() or by manually 7430 * calling __free_page() on each allocated page. 7431 * 7432 * Return: pointer to contiguous pages on success, or NULL if not successful. 7433 */ 7434 struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, 7435 int nid, nodemask_t *nodemask) 7436 { 7437 struct page *page; 7438 7439 if (WARN_ON(gfp_mask & __GFP_COMP)) 7440 return NULL; 7441 7442 page = alloc_contig_frozen_pages_noprof(nr_pages, gfp_mask, nid, 7443 nodemask); 7444 if (page) 7445 set_pages_refcounted(page, nr_pages); 7446 7447 return page; 7448 } 7449 EXPORT_SYMBOL(alloc_contig_pages_noprof); 7450 7451 /** 7452 * free_contig_frozen_range() -- free the contiguous range of frozen pages 7453 * @pfn: start PFN to free 7454 * @nr_pages: Number of contiguous frozen pages to free 7455 * 7456 * This can be used to free the allocated compound/non-compound frozen pages. 
7457 */ 7458 void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) 7459 { 7460 struct page *first_page = pfn_to_page(pfn); 7461 const unsigned int order = ilog2(nr_pages); 7462 7463 if (WARN_ON_ONCE(first_page != compound_head(first_page))) 7464 return; 7465 7466 if (PageHead(first_page)) { 7467 WARN_ON_ONCE(order != compound_order(first_page)); 7468 free_frozen_pages(first_page, order); 7469 return; 7470 } 7471 7472 __free_contig_frozen_range(pfn, nr_pages); 7473 } 7474 EXPORT_SYMBOL(free_contig_frozen_range); 7475 7476 /** 7477 * free_contig_range() -- free the contiguous range of pages 7478 * @pfn: start PFN to free 7479 * @nr_pages: Number of contiguous pages to free 7480 * 7481 * This can be only used to free the allocated non-compound pages. 7482 */ 7483 void free_contig_range(unsigned long pfn, unsigned long nr_pages) 7484 { 7485 if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn)))) 7486 return; 7487 7488 __free_contig_range(pfn, nr_pages); 7489 } 7490 EXPORT_SYMBOL(free_contig_range); 7491 #endif /* CONFIG_CONTIG_ALLOC */ 7492 7493 /* 7494 * Effectively disable pcplists for the zone by setting the high limit to 0 7495 * and draining all cpus. A concurrent page freeing on another CPU that's about 7496 * to put the page on pcplist will either finish before the drain and the page 7497 * will be drained, or observe the new high limit and skip the pcplist. 7498 * 7499 * Must be paired with a call to zone_pcp_enable(). 
7500 */ 7501 void zone_pcp_disable(struct zone *zone) 7502 { 7503 mutex_lock(&pcp_batch_high_lock); 7504 __zone_set_pageset_high_and_batch(zone, 0, 0, 1); 7505 __drain_all_pages(zone, true); 7506 } 7507 7508 void zone_pcp_enable(struct zone *zone) 7509 { 7510 __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min, 7511 zone->pageset_high_max, zone->pageset_batch); 7512 mutex_unlock(&pcp_batch_high_lock); 7513 } 7514 7515 void zone_pcp_reset(struct zone *zone) 7516 { 7517 int cpu; 7518 struct per_cpu_zonestat *pzstats; 7519 7520 if (zone->per_cpu_pageset != &boot_pageset) { 7521 for_each_online_cpu(cpu) { 7522 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); 7523 drain_zonestat(zone, pzstats); 7524 } 7525 free_percpu(zone->per_cpu_pageset); 7526 zone->per_cpu_pageset = &boot_pageset; 7527 if (zone->per_cpu_zonestats != &boot_zonestats) { 7528 free_percpu(zone->per_cpu_zonestats); 7529 zone->per_cpu_zonestats = &boot_zonestats; 7530 } 7531 } 7532 } 7533 7534 #ifdef CONFIG_MEMORY_HOTREMOVE 7535 /* 7536 * All pages in the range must be in a single zone, must not contain holes, 7537 * must span full sections, and must be isolated before calling this function. 7538 * 7539 * Returns the number of managed (non-PageOffline()) pages in the range: the 7540 * number of pages for which memory offlining code must adjust managed page 7541 * counters using adjust_managed_page_count(). 7542 */ 7543 unsigned long __offline_isolated_pages(unsigned long start_pfn, 7544 unsigned long end_pfn) 7545 { 7546 unsigned long already_offline = 0; 7547 unsigned long pfn = start_pfn; 7548 struct page *page; 7549 struct zone *zone; 7550 unsigned int order; 7551 7552 offline_mem_sections(pfn, end_pfn); 7553 zone = page_zone(pfn_to_page(pfn)); 7554 guard(spinlock_irqsave)(&zone->lock); 7555 while (pfn < end_pfn) { 7556 page = pfn_to_page(pfn); 7557 /* 7558 * The HWPoisoned page may be not in buddy system, and 7559 * page_count() is not 0. 
7560 */ 7561 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 7562 pfn++; 7563 continue; 7564 } 7565 /* 7566 * At this point all remaining PageOffline() pages have a 7567 * reference count of 0 and can simply be skipped. 7568 */ 7569 if (PageOffline(page)) { 7570 BUG_ON(page_count(page)); 7571 BUG_ON(PageBuddy(page)); 7572 already_offline++; 7573 pfn++; 7574 continue; 7575 } 7576 7577 BUG_ON(page_count(page)); 7578 BUG_ON(!PageBuddy(page)); 7579 VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); 7580 order = buddy_order(page); 7581 del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); 7582 pfn += (1 << order); 7583 } 7584 7585 return end_pfn - start_pfn - already_offline; 7586 } 7587 #endif 7588 7589 /* 7590 * This function returns a stable result only if called under zone lock. 7591 */ 7592 bool is_free_buddy_page(const struct page *page) 7593 { 7594 unsigned long pfn = page_to_pfn(page); 7595 unsigned int order; 7596 7597 for (order = 0; order < NR_PAGE_ORDERS; order++) { 7598 const struct page *head = page - (pfn & ((1 << order) - 1)); 7599 7600 if (PageBuddy(head) && 7601 buddy_order_unsafe(head) >= order) 7602 break; 7603 } 7604 7605 return order <= MAX_PAGE_ORDER; 7606 } 7607 EXPORT_SYMBOL(is_free_buddy_page); 7608 7609 #ifdef CONFIG_MEMORY_FAILURE 7610 static inline void add_to_free_list(struct page *page, struct zone *zone, 7611 unsigned int order, int migratetype, 7612 bool tail) 7613 { 7614 __add_to_free_list(page, zone, order, migratetype, tail); 7615 account_freepages(zone, 1 << order, migratetype); 7616 } 7617 7618 /* 7619 * Break down a higher-order page in sub-pages, and keep our target out of 7620 * buddy allocator. 
7621 */ 7622 static void break_down_buddy_pages(struct zone *zone, struct page *page, 7623 struct page *target, int low, int high, 7624 int migratetype) 7625 { 7626 unsigned long size = 1 << high; 7627 struct page *current_buddy; 7628 7629 while (high > low) { 7630 high--; 7631 size >>= 1; 7632 7633 if (target >= &page[size]) { 7634 current_buddy = page; 7635 page = page + size; 7636 } else { 7637 current_buddy = page + size; 7638 } 7639 7640 if (set_page_guard(zone, current_buddy, high)) 7641 continue; 7642 7643 add_to_free_list(current_buddy, zone, high, migratetype, false); 7644 set_buddy_order(current_buddy, high); 7645 } 7646 } 7647 7648 /* 7649 * Take a page that will be marked as poisoned off the buddy allocator. 7650 */ 7651 bool take_page_off_buddy(struct page *page) 7652 { 7653 struct zone *zone = page_zone(page); 7654 unsigned long pfn = page_to_pfn(page); 7655 unsigned int order; 7656 7657 guard(spinlock_irqsave)(&zone->lock); 7658 for (order = 0; order < NR_PAGE_ORDERS; order++) { 7659 struct page *page_head = page - (pfn & ((1 << order) - 1)); 7660 int page_order = buddy_order(page_head); 7661 7662 if (PageBuddy(page_head) && page_order >= order) { 7663 unsigned long pfn_head = page_to_pfn(page_head); 7664 int migratetype = get_pfnblock_migratetype(page_head, 7665 pfn_head); 7666 7667 del_page_from_free_list(page_head, zone, page_order, 7668 migratetype); 7669 break_down_buddy_pages(zone, page_head, page, 0, 7670 page_order, migratetype); 7671 SetPageHWPoisonTakenOff(page); 7672 return true; 7673 } 7674 if (page_count(page_head) > 0) 7675 break; 7676 } 7677 return false; 7678 } 7679 7680 /* 7681 * Cancel takeoff done by take_page_off_buddy(). 
7682 */ 7683 bool put_page_back_buddy(struct page *page) 7684 { 7685 struct zone *zone = page_zone(page); 7686 7687 guard(spinlock_irqsave)(&zone->lock); 7688 if (put_page_testzero(page)) { 7689 unsigned long pfn = page_to_pfn(page); 7690 int migratetype = get_pfnblock_migratetype(page, pfn); 7691 7692 ClearPageHWPoisonTakenOff(page); 7693 __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); 7694 if (TestClearPageHWPoison(page)) 7695 return true; 7696 } 7697 7698 return false; 7699 } 7700 #endif 7701 7702 bool has_managed_zone(enum zone_type zone) 7703 { 7704 struct pglist_data *pgdat; 7705 7706 for_each_online_pgdat(pgdat) { 7707 if (managed_zone(&pgdat->node_zones[zone])) 7708 return true; 7709 } 7710 return false; 7711 } 7712 7713 #ifdef CONFIG_UNACCEPTED_MEMORY 7714 7715 static bool lazy_accept = true; 7716 7717 static int __init accept_memory_parse(char *p) 7718 { 7719 if (!strcmp(p, "lazy")) { 7720 lazy_accept = true; 7721 return 0; 7722 } else if (!strcmp(p, "eager")) { 7723 lazy_accept = false; 7724 return 0; 7725 } else { 7726 return -EINVAL; 7727 } 7728 } 7729 early_param("accept_memory", accept_memory_parse); 7730 7731 static bool page_contains_unaccepted(struct page *page, unsigned int order) 7732 { 7733 phys_addr_t start = page_to_phys(page); 7734 7735 return range_contains_unaccepted_memory(start, PAGE_SIZE << order); 7736 } 7737 7738 static void __accept_page(struct zone *zone, unsigned long *flags, 7739 struct page *page) 7740 { 7741 list_del(&page->lru); 7742 account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); 7743 __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); 7744 __ClearPageUnaccepted(page); 7745 spin_unlock_irqrestore(&zone->lock, *flags); 7746 7747 accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); 7748 7749 __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); 7750 } 7751 7752 void accept_page(struct page *page) 7753 { 7754 struct zone *zone = page_zone(page); 7755 unsigned long flags; 7756 7757 
spin_lock_irqsave(&zone->lock, flags); 7758 if (!PageUnaccepted(page)) { 7759 spin_unlock_irqrestore(&zone->lock, flags); 7760 return; 7761 } 7762 7763 /* Unlocks zone->lock */ 7764 __accept_page(zone, &flags, page); 7765 } 7766 7767 static bool try_to_accept_memory_one(struct zone *zone) 7768 { 7769 unsigned long flags; 7770 struct page *page; 7771 7772 spin_lock_irqsave(&zone->lock, flags); 7773 page = list_first_entry_or_null(&zone->unaccepted_pages, 7774 struct page, lru); 7775 if (!page) { 7776 spin_unlock_irqrestore(&zone->lock, flags); 7777 return false; 7778 } 7779 7780 /* Unlocks zone->lock */ 7781 __accept_page(zone, &flags, page); 7782 7783 return true; 7784 } 7785 7786 static bool cond_accept_memory(struct zone *zone, unsigned int order, 7787 int alloc_flags) 7788 { 7789 long to_accept, wmark; 7790 bool ret = false; 7791 7792 if (list_empty(&zone->unaccepted_pages)) 7793 return false; 7794 7795 /* Bailout, since try_to_accept_memory_one() needs to take a lock */ 7796 if (alloc_flags & ALLOC_TRYLOCK) 7797 return false; 7798 7799 wmark = promo_wmark_pages(zone); 7800 7801 /* 7802 * Watermarks have not been initialized yet. 7803 * 7804 * Accepting one MAX_ORDER page to ensure progress. 7805 */ 7806 if (!wmark) 7807 return try_to_accept_memory_one(zone); 7808 7809 /* How much to accept to get to promo watermark? 
*/ 7810 to_accept = wmark - 7811 (zone_page_state(zone, NR_FREE_PAGES) - 7812 __zone_watermark_unusable_free(zone, order, 0) - 7813 zone_page_state(zone, NR_UNACCEPTED)); 7814 7815 while (to_accept > 0) { 7816 if (!try_to_accept_memory_one(zone)) 7817 break; 7818 ret = true; 7819 to_accept -= MAX_ORDER_NR_PAGES; 7820 } 7821 7822 return ret; 7823 } 7824 7825 static bool __free_unaccepted(struct page *page) 7826 { 7827 struct zone *zone = page_zone(page); 7828 unsigned long flags; 7829 7830 if (!lazy_accept) 7831 return false; 7832 7833 spin_lock_irqsave(&zone->lock, flags); 7834 list_add_tail(&page->lru, &zone->unaccepted_pages); 7835 account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); 7836 __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); 7837 __SetPageUnaccepted(page); 7838 spin_unlock_irqrestore(&zone->lock, flags); 7839 7840 return true; 7841 } 7842 7843 #else 7844 7845 static bool page_contains_unaccepted(struct page *page, unsigned int order) 7846 { 7847 return false; 7848 } 7849 7850 static bool cond_accept_memory(struct zone *zone, unsigned int order, 7851 int alloc_flags) 7852 { 7853 return false; 7854 } 7855 7856 static bool __free_unaccepted(struct page *page) 7857 { 7858 BUILD_BUG(); 7859 return false; 7860 } 7861 7862 #endif /* CONFIG_UNACCEPTED_MEMORY */ 7863 7864 struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) 7865 { 7866 /* 7867 * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. 7868 * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd 7869 * is not safe in arbitrary context. 7870 * 7871 * These two are the conditions for gfpflags_allow_spinning() being true. 7872 * 7873 * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason 7874 * to warn. Also warn would trigger printk() which is unsafe from 7875 * various contexts. We cannot use printk_deferred_enter() to mitigate, 7876 * since the running context is unknown. 
7877 * 7878 * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below 7879 * is safe in any context. Also zeroing the page is mandatory for 7880 * BPF use cases. 7881 * 7882 * Though __GFP_NOMEMALLOC is not checked in the code path below, 7883 * specify it here to highlight that alloc_pages_nolock() 7884 * doesn't want to deplete reserves. 7885 */ 7886 gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP 7887 | gfp_flags; 7888 unsigned int alloc_flags = ALLOC_TRYLOCK; 7889 struct alloc_context ac = { }; 7890 struct page *page; 7891 7892 VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT); 7893 /* 7894 * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is 7895 * unsafe in NMI. If spin_trylock() is called from hard IRQ the current 7896 * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will 7897 * mark the task as the owner of another rt_spin_lock which will 7898 * confuse PI logic, so return immediately if called from hard IRQ or 7899 * NMI. 7900 * 7901 * Note, irqs_disabled() case is ok. This function can be called 7902 * from raw_spin_lock_irqsave region. 7903 */ 7904 if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) 7905 return NULL; 7906 7907 /* On UP, spin_trylock() always succeeds even when it is locked */ 7908 if (!IS_ENABLED(CONFIG_SMP) && in_nmi()) 7909 return NULL; 7910 7911 if (!pcp_allowed_order(order)) 7912 return NULL; 7913 7914 /* Bailout, since _deferred_grow_zone() needs to take a lock */ 7915 if (deferred_pages_enabled()) 7916 return NULL; 7917 7918 if (nid == NUMA_NO_NODE) 7919 nid = numa_node_id(); 7920 7921 prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac, 7922 &alloc_gfp, &alloc_flags); 7923 7924 /* 7925 * Best effort allocation from percpu free list. 7926 * If it's empty attempt to spin_trylock zone->lock. 7927 */ 7928 page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); 7929 7930 /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). 
*/ 7931 7932 if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) && 7933 unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { 7934 __free_frozen_pages(page, order, FPI_TRYLOCK); 7935 page = NULL; 7936 } 7937 trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); 7938 kmsan_alloc_page(page, order, alloc_gfp); 7939 return page; 7940 } 7941 /** 7942 * alloc_pages_nolock - opportunistic reentrant allocation from any context 7943 * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. 7944 * @nid: node to allocate from 7945 * @order: allocation order size 7946 * 7947 * Allocates pages of a given order from the given node. This is safe to 7948 * call from any context where RCU is watching (from atomic, NMI, and also 7949 * reentrant allocator -> tracepoint -> alloc_pages_nolock_noprof). 7950 * Allocation is best effort and to be expected to fail easily so nobody should 7951 * rely on the success. Failures are not reported via warn_alloc(). 7952 * See always fail conditions below. 7953 * 7954 * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. 7955 * It means ENOMEM. There is no reason to call it again and expect !NULL. 7956 */ 7957 struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) 7958 { 7959 struct page *page; 7960 7961 page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order); 7962 if (page) 7963 set_page_refcounted(page); 7964 return page; 7965 } 7966 EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof); 7967